/[dtapublic]/projs/trunk/shared_source/tcl_base/regc_lex.c
ViewVC logotype

Contents of /projs/trunk/shared_source/tcl_base/regc_lex.c

Parent Directory | Revision Log


Revision 42 - (show annotations) (download)
Fri Oct 14 01:50:00 2016 UTC (8 years, 2 months ago) by dashley
File MIME type: text/plain
File size: 25927 byte(s)
Move shared source code to commonize.
1 /* $Header: /cvsroot/esrg/sfesrg/esrgpcpj/shared/tcl_base/regc_lex.c,v 1.1.1.1 2001/06/13 04:31:50 dtashley Exp $ */
2
3 /*
4 * lexical analyzer
5 * This file is #included by regcomp.c.
6 *
7 * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
8 *
9 * Development of this software was funded, in part, by Cray Research Inc.,
10 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
11 * Corporation, none of whom are responsible for the results. The author
12 * thanks all of them.
13 *
14 * Redistribution and use in source and binary forms -- with or without
15 * modification -- are permitted for any purpose, provided that
16 * redistributions in source form retain this entire copyright notice and
17 * indicate the origin and nature of any modifications.
18 *
19 * I'd appreciate being given credit for this package in the documentation
20 * of software which uses it, but that is not a requirement.
21 *
22 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
23 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
24 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
25 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
26 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
27 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
28 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
29 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
30 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
31 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 *
33 */
34
/*
 * Scanning macros.  Each of these expands to code that refers to a
 * variable named v, so they are usable only where such a variable (with
 * now, stop, nexttype, nextvalue and lasttype members) is in scope.
 */

/* is the scan pointer at (or beyond) the end of the input? */
#define ATEOS()         (v->now >= v->stop)
/* are at least n chrs left to scan? */
#define HAVE(n)         (v->stop - v->now >= (n))

/* do the next one / two / three unread chrs equal the given characters? */
#define NEXT1(c)        (!ATEOS() && v->now[0] == CHR(c))
#define NEXT2(a,b)      (HAVE(2) && v->now[0] == CHR(a) && v->now[1] == CHR(b))
#define NEXT3(a,b,c)    (HAVE(3) && v->now[0] == CHR(a) && \
                                v->now[1] == CHR(b) && \
                                v->now[2] == CHR(c))

/* record the type (and optionally the value) of the token just scanned */
#define SET(c)          (v->nexttype = (c))
#define SETV(c, n)      (v->nexttype = (c), v->nextvalue = (n))

/* record the token and return 1 from the enclosing function */
#define RET(c)          return (SET(c), 1)
#define RETV(c, n)      return (SETV(c, n), 1)

/* report error e and return 0 from the enclosing function */
#define FAILW(e)        return (ERR(e), 0)      /* ERR does SET(EOS) */

/* was the previously returned token of type t? */
#define LASTTYPE(t)     (v->lasttype == (t))
49
/*
 * Lexical contexts: the value kept in v->lexcon, which selects the set of
 * scanning rules applied to the next chr.
 */
#define L_ERE       1       /* mainline ERE/ARE */
#define L_BRE       2       /* mainline BRE */
#define L_Q         3       /* REG_QUOTE */
#define L_EBND      4       /* ERE/ARE bound */
#define L_BBND      5       /* BRE bound */
#define L_BRACK     6       /* brackets */
#define L_CEL       7       /* collating element */
#define L_ECL       8       /* equivalence class */
#define L_CCL       9       /* character class */

/* switch to context c / test whether the current context is con */
#define INTOCON(c)  (v->lexcon = (c))
#define INCON(con)  (v->lexcon == (con))
62
/*
 * Pointer just past the last element of a chr array.  The argument must be
 * a real array, not a pointer, because the element count comes from sizeof.
 */
#define ENDOF(array)    ((array) + (sizeof(array) / sizeof(chr)))
65
66 /*
67 - lexstart - set up lexical stuff, scan leading options
68 ^ static VOID lexstart(struct vars *);
69 */
70 static VOID
71 lexstart(v)
72 struct vars *v;
73 {
74 prefixes(v); /* may turn on new type bits etc. */
75 NOERR();
76
77 if (v->cflags&REG_QUOTE) {
78 assert(!(v->cflags&(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE)));
79 INTOCON(L_Q);
80 } else if (v->cflags&REG_EXTENDED) {
81 assert(!(v->cflags&REG_QUOTE));
82 INTOCON(L_ERE);
83 } else {
84 assert(!(v->cflags&(REG_QUOTE|REG_ADVF)));
85 INTOCON(L_BRE);
86 }
87
88 v->nexttype = EMPTY; /* remember we were at the start */
89 next(v); /* set up the first token */
90 }
91
92 /*
93 - prefixes - implement various special prefixes
94 ^ static VOID prefixes(struct vars *);
95 */
96 static VOID
97 prefixes(v)
98 struct vars *v;
99 {
100 /* literal string doesn't get any of this stuff */
101 if (v->cflags&REG_QUOTE)
102 return;
103
104 /* initial "***" gets special things */
105 if (HAVE(4) && NEXT3('*', '*', '*'))
106 switch (*(v->now + 3)) {
107 case CHR('?'): /* "***?" error, msg shows version */
108 ERR(REG_BADPAT);
109 return; /* proceed no further */
110 break;
111 case CHR('='): /* "***=" shifts to literal string */
112 NOTE(REG_UNONPOSIX);
113 v->cflags |= REG_QUOTE;
114 v->cflags &= ~(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE);
115 v->now += 4;
116 return; /* and there can be no more prefixes */
117 break;
118 case CHR(':'): /* "***:" shifts to AREs */
119 NOTE(REG_UNONPOSIX);
120 v->cflags |= REG_ADVANCED;
121 v->now += 4;
122 break;
123 default: /* otherwise *** is just an error */
124 ERR(REG_BADRPT);
125 return;
126 break;
127 }
128
129 /* BREs and EREs don't get embedded options */
130 if ((v->cflags&REG_ADVANCED) != REG_ADVANCED)
131 return;
132
133 /* embedded options (AREs only) */
134 if (HAVE(3) && NEXT2('(', '?') && iscalpha(*(v->now + 2))) {
135 NOTE(REG_UNONPOSIX);
136 v->now += 2;
137 for (; !ATEOS() && iscalpha(*v->now); v->now++)
138 switch (*v->now) {
139 case CHR('b'): /* BREs (but why???) */
140 v->cflags &= ~(REG_ADVANCED|REG_QUOTE);
141 break;
142 case CHR('c'): /* case sensitive */
143 v->cflags &= ~REG_ICASE;
144 break;
145 case CHR('e'): /* plain EREs */
146 v->cflags |= REG_EXTENDED;
147 v->cflags &= ~(REG_ADVF|REG_QUOTE);
148 break;
149 case CHR('i'): /* case insensitive */
150 v->cflags |= REG_ICASE;
151 break;
152 case CHR('m'): /* Perloid synonym for n */
153 case CHR('n'): /* \n affects ^ $ . [^ */
154 v->cflags |= REG_NEWLINE;
155 break;
156 case CHR('p'): /* ~Perl, \n affects . [^ */
157 v->cflags |= REG_NLSTOP;
158 v->cflags &= ~REG_NLANCH;
159 break;
160 case CHR('q'): /* literal string */
161 v->cflags |= REG_QUOTE;
162 v->cflags &= ~REG_ADVANCED;
163 break;
164 case CHR('s'): /* single line, \n ordinary */
165 v->cflags &= ~REG_NEWLINE;
166 break;
167 case CHR('t'): /* tight syntax */
168 v->cflags &= ~REG_EXPANDED;
169 break;
170 case CHR('w'): /* weird, \n affects ^ $ only */
171 v->cflags &= ~REG_NLSTOP;
172 v->cflags |= REG_NLANCH;
173 break;
174 case CHR('x'): /* expanded syntax */
175 v->cflags |= REG_EXPANDED;
176 break;
177 default:
178 ERR(REG_BADOPT);
179 return;
180 }
181 if (!NEXT1(')')) {
182 ERR(REG_BADOPT);
183 return;
184 }
185 v->now++;
186 if (v->cflags&REG_QUOTE)
187 v->cflags &= ~(REG_EXPANDED|REG_NEWLINE);
188 }
189 }
190
191 /*
192 - lexnest - "call a subroutine", interpolating string at the lexical level
193 * Note, this is not a very general facility. There are a number of
194 * implicit assumptions about what sorts of strings can be subroutines.
195 ^ static VOID lexnest(struct vars *, chr *, chr *);
196 */
197 static VOID
198 lexnest(v, beginp, endp)
199 struct vars *v;
200 chr *beginp; /* start of interpolation */
201 chr *endp; /* one past end of interpolation */
202 {
203 assert(v->savenow == NULL); /* only one level of nesting */
204 v->savenow = v->now;
205 v->savestop = v->stop;
206 v->now = beginp;
207 v->stop = endp;
208 }
209
210 /*
211 * string constants to interpolate as expansions of things like \d
212 */
213 static chr backd[] = { /* \d */
214 CHR('['), CHR('['), CHR(':'),
215 CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
216 CHR(':'), CHR(']'), CHR(']')
217 };
218 static chr backD[] = { /* \D */
219 CHR('['), CHR('^'), CHR('['), CHR(':'),
220 CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
221 CHR(':'), CHR(']'), CHR(']')
222 };
223 static chr brbackd[] = { /* \d within brackets */
224 CHR('['), CHR(':'),
225 CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
226 CHR(':'), CHR(']')
227 };
228 static chr backs[] = { /* \s */
229 CHR('['), CHR('['), CHR(':'),
230 CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
231 CHR(':'), CHR(']'), CHR(']')
232 };
233 static chr backS[] = { /* \S */
234 CHR('['), CHR('^'), CHR('['), CHR(':'),
235 CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
236 CHR(':'), CHR(']'), CHR(']')
237 };
238 static chr brbacks[] = { /* \s within brackets */
239 CHR('['), CHR(':'),
240 CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
241 CHR(':'), CHR(']')
242 };
243 static chr backw[] = { /* \w */
244 CHR('['), CHR('['), CHR(':'),
245 CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
246 CHR(':'), CHR(']'), CHR('_'), CHR(']')
247 };
248 static chr backW[] = { /* \W */
249 CHR('['), CHR('^'), CHR('['), CHR(':'),
250 CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
251 CHR(':'), CHR(']'), CHR('_'), CHR(']')
252 };
253 static chr brbackw[] = { /* \w within brackets */
254 CHR('['), CHR(':'),
255 CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
256 CHR(':'), CHR(']'), CHR('_')
257 };
258
259 /*
260 - lexword - interpolate a bracket expression for word characters
261 * Possibly ought to inquire whether there is a "word" character class.
262 ^ static VOID lexword(struct vars *);
263 */
264 static VOID
265 lexword(v)
266 struct vars *v;
267 {
268 lexnest(v, backw, ENDOF(backw));
269 }
270
271 /*
272 - next - get next token
273 ^ static int next(struct vars *);
274 */
275 static int /* 1 normal, 0 failure */
276 next(v)
277 struct vars *v;
278 {
279 chr c;
280
281 /* errors yield an infinite sequence of failures */
282 if (ISERR())
283 return 0; /* the error has set nexttype to EOS */
284
285 /* remember flavor of last token */
286 v->lasttype = v->nexttype;
287
288 /* REG_BOSONLY */
289 if (v->nexttype == EMPTY && (v->cflags&REG_BOSONLY)) {
290 /* at start of a REG_BOSONLY RE */
291 RETV(SBEGIN, 0); /* same as \A */
292 }
293
294 /* if we're nested and we've hit end, return to outer level */
295 if (v->savenow != NULL && ATEOS()) {
296 v->now = v->savenow;
297 v->stop = v->savestop;
298 v->savenow = v->savestop = NULL;
299 }
300
301 /* skip white space etc. if appropriate (not in literal or []) */
302 if (v->cflags&REG_EXPANDED)
303 switch (v->lexcon) {
304 case L_ERE:
305 case L_BRE:
306 case L_EBND:
307 case L_BBND:
308 skip(v);
309 break;
310 }
311
312 /* handle EOS, depending on context */
313 if (ATEOS()) {
314 switch (v->lexcon) {
315 case L_ERE:
316 case L_BRE:
317 case L_Q:
318 RET(EOS);
319 break;
320 case L_EBND:
321 case L_BBND:
322 FAILW(REG_EBRACE);
323 break;
324 case L_BRACK:
325 case L_CEL:
326 case L_ECL:
327 case L_CCL:
328 FAILW(REG_EBRACK);
329 break;
330 }
331 assert(NOTREACHED);
332 }
333
334 /* okay, time to actually get a character */
335 c = *v->now++;
336
337 /* deal with the easy contexts, punt EREs to code below */
338 switch (v->lexcon) {
339 case L_BRE: /* punt BREs to separate function */
340 return brenext(v, c);
341 break;
342 case L_ERE: /* see below */
343 break;
344 case L_Q: /* literal strings are easy */
345 RETV(PLAIN, c);
346 break;
347 case L_BBND: /* bounds are fairly simple */
348 case L_EBND:
349 switch (c) {
350 case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'):
351 case CHR('4'): case CHR('5'): case CHR('6'): case CHR('7'):
352 case CHR('8'): case CHR('9'):
353 RETV(DIGIT, (chr)DIGITVAL(c));
354 break;
355 case CHR(','):
356 RET(',');
357 break;
358 case CHR('}'): /* ERE bound ends with } */
359 if (INCON(L_EBND)) {
360 INTOCON(L_ERE);
361 if ((v->cflags&REG_ADVF) && NEXT1('?')) {
362 v->now++;
363 NOTE(REG_UNONPOSIX);
364 RETV('}', 0);
365 }
366 RETV('}', 1);
367 } else
368 FAILW(REG_BADBR);
369 break;
370 case CHR('\\'): /* BRE bound ends with \} */
371 if (INCON(L_BBND) && NEXT1('}')) {
372 v->now++;
373 INTOCON(L_BRE);
374 RET('}');
375 } else
376 FAILW(REG_BADBR);
377 break;
378 default:
379 FAILW(REG_BADBR);
380 break;
381 }
382 assert(NOTREACHED);
383 break;
384 case L_BRACK: /* brackets are not too hard */
385 switch (c) {
386 case CHR(']'):
387 if (LASTTYPE('['))
388 RETV(PLAIN, c);
389 else {
390 INTOCON((v->cflags&REG_EXTENDED) ?
391 L_ERE : L_BRE);
392 RET(']');
393 }
394 break;
395 case CHR('\\'):
396 NOTE(REG_UBBS);
397 if (!(v->cflags&REG_ADVF))
398 RETV(PLAIN, c);
399 NOTE(REG_UNONPOSIX);
400 if (ATEOS())
401 FAILW(REG_EESCAPE);
402 (DISCARD)lexescape(v);
403 switch (v->nexttype) { /* not all escapes okay here */
404 case PLAIN:
405 return 1;
406 break;
407 case CCLASS:
408 switch (v->nextvalue) {
409 case 'd':
410 lexnest(v, brbackd, ENDOF(brbackd));
411 break;
412 case 's':
413 lexnest(v, brbacks, ENDOF(brbacks));
414 break;
415 case 'w':
416 lexnest(v, brbackw, ENDOF(brbackw));
417 break;
418 default:
419 FAILW(REG_EESCAPE);
420 break;
421 }
422 /* lexnest done, back up and try again */
423 v->nexttype = v->lasttype;
424 return next(v);
425 break;
426 }
427 /* not one of the acceptable escapes */
428 FAILW(REG_EESCAPE);
429 break;
430 case CHR('-'):
431 if (LASTTYPE('[') || NEXT1(']'))
432 RETV(PLAIN, c);
433 else
434 RETV(RANGE, c);
435 break;
436 case CHR('['):
437 if (ATEOS())
438 FAILW(REG_EBRACK);
439 switch (*v->now++) {
440 case CHR('.'):
441 INTOCON(L_CEL);
442 /* might or might not be locale-specific */
443 RET(COLLEL);
444 break;
445 case CHR('='):
446 INTOCON(L_ECL);
447 NOTE(REG_ULOCALE);
448 RET(ECLASS);
449 break;
450 case CHR(':'):
451 INTOCON(L_CCL);
452 NOTE(REG_ULOCALE);
453 RET(CCLASS);
454 break;
455 default: /* oops */
456 v->now--;
457 RETV(PLAIN, c);
458 break;
459 }
460 assert(NOTREACHED);
461 break;
462 default:
463 RETV(PLAIN, c);
464 break;
465 }
466 assert(NOTREACHED);
467 break;
468 case L_CEL: /* collating elements are easy */
469 if (c == CHR('.') && NEXT1(']')) {
470 v->now++;
471 INTOCON(L_BRACK);
472 RETV(END, '.');
473 } else
474 RETV(PLAIN, c);
475 break;
476 case L_ECL: /* ditto equivalence classes */
477 if (c == CHR('=') && NEXT1(']')) {
478 v->now++;
479 INTOCON(L_BRACK);
480 RETV(END, '=');
481 } else
482 RETV(PLAIN, c);
483 break;
484 case L_CCL: /* ditto character classes */
485 if (c == CHR(':') && NEXT1(']')) {
486 v->now++;
487 INTOCON(L_BRACK);
488 RETV(END, ':');
489 } else
490 RETV(PLAIN, c);
491 break;
492 default:
493 assert(NOTREACHED);
494 break;
495 }
496
497 /* that got rid of everything except EREs and AREs */
498 assert(INCON(L_ERE));
499
500 /* deal with EREs and AREs, except for backslashes */
501 switch (c) {
502 case CHR('|'):
503 RET('|');
504 break;
505 case CHR('*'):
506 if ((v->cflags&REG_ADVF) && NEXT1('?')) {
507 v->now++;
508 NOTE(REG_UNONPOSIX);
509 RETV('*', 0);
510 }
511 RETV('*', 1);
512 break;
513 case CHR('+'):
514 if ((v->cflags&REG_ADVF) && NEXT1('?')) {
515 v->now++;
516 NOTE(REG_UNONPOSIX);
517 RETV('+', 0);
518 }
519 RETV('+', 1);
520 break;
521 case CHR('?'):
522 if ((v->cflags&REG_ADVF) && NEXT1('?')) {
523 v->now++;
524 NOTE(REG_UNONPOSIX);
525 RETV('?', 0);
526 }
527 RETV('?', 1);
528 break;
529 case CHR('{'): /* bounds start or plain character */
530 if (v->cflags&REG_EXPANDED)
531 skip(v);
532 if (ATEOS() || !iscdigit(*v->now)) {
533 NOTE(REG_UBRACES);
534 NOTE(REG_UUNSPEC);
535 RETV(PLAIN, c);
536 } else {
537 NOTE(REG_UBOUNDS);
538 INTOCON(L_EBND);
539 RET('{');
540 }
541 assert(NOTREACHED);
542 break;
543 case CHR('('): /* parenthesis, or advanced extension */
544 if ((v->cflags&REG_ADVF) && NEXT1('?')) {
545 NOTE(REG_UNONPOSIX);
546 v->now++;
547 switch (*v->now++) {
548 case CHR(':'): /* non-capturing paren */
549 RETV('(', 0);
550 break;
551 case CHR('#'): /* comment */
552 while (!ATEOS() && *v->now != CHR(')'))
553 v->now++;
554 if (!ATEOS())
555 v->now++;
556 assert(v->nexttype == v->lasttype);
557 return next(v);
558 break;
559 case CHR('='): /* positive lookahead */
560 NOTE(REG_ULOOKAHEAD);
561 RETV(LACON, 1);
562 break;
563 case CHR('!'): /* negative lookahead */
564 NOTE(REG_ULOOKAHEAD);
565 RETV(LACON, 0);
566 break;
567 default:
568 FAILW(REG_BADRPT);
569 break;
570 }
571 assert(NOTREACHED);
572 }
573 if (v->cflags&REG_NOSUB)
574 RETV('(', 0); /* all parens non-capturing */
575 else
576 RETV('(', 1);
577 break;
578 case CHR(')'):
579 if (LASTTYPE('(')) {
580 NOTE(REG_UUNSPEC);
581 }
582 RETV(')', c);
583 break;
584 case CHR('['): /* easy except for [[:<:]] and [[:>:]] */
585 if (HAVE(6) && *(v->now+0) == CHR('[') &&
586 *(v->now+1) == CHR(':') &&
587 (*(v->now+2) == CHR('<') ||
588 *(v->now+2) == CHR('>')) &&
589 *(v->now+3) == CHR(':') &&
590 *(v->now+4) == CHR(']') &&
591 *(v->now+5) == CHR(']')) {
592 c = *(v->now+2);
593 v->now += 6;
594 NOTE(REG_UNONPOSIX);
595 RET((c == CHR('<')) ? '<' : '>');
596 }
597 INTOCON(L_BRACK);
598 if (NEXT1('^')) {
599 v->now++;
600 RETV('[', 0);
601 }
602 RETV('[', 1);
603 break;
604 case CHR('.'):
605 RET('.');
606 break;
607 case CHR('^'):
608 RET('^');
609 break;
610 case CHR('$'):
611 RET('$');
612 break;
613 case CHR('\\'): /* mostly punt backslashes to code below */
614 if (ATEOS())
615 FAILW(REG_EESCAPE);
616 break;
617 default: /* ordinary character */
618 RETV(PLAIN, c);
619 break;
620 }
621
622 /* ERE/ARE backslash handling; backslash already eaten */
623 assert(!ATEOS());
624 if (!(v->cflags&REG_ADVF)) { /* only AREs have non-trivial escapes */
625 if (iscalnum(*v->now)) {
626 NOTE(REG_UBSALNUM);
627 NOTE(REG_UUNSPEC);
628 }
629 RETV(PLAIN, *v->now++);
630 }
631 (DISCARD)lexescape(v);
632 if (ISERR())
633 FAILW(REG_EESCAPE);
634 if (v->nexttype == CCLASS) { /* fudge at lexical level */
635 switch (v->nextvalue) {
636 case 'd': lexnest(v, backd, ENDOF(backd)); break;
637 case 'D': lexnest(v, backD, ENDOF(backD)); break;
638 case 's': lexnest(v, backs, ENDOF(backs)); break;
639 case 'S': lexnest(v, backS, ENDOF(backS)); break;
640 case 'w': lexnest(v, backw, ENDOF(backw)); break;
641 case 'W': lexnest(v, backW, ENDOF(backW)); break;
642 default:
643 assert(NOTREACHED);
644 FAILW(REG_ASSERT);
645 break;
646 }
647 /* lexnest done, back up and try again */
648 v->nexttype = v->lasttype;
649 return next(v);
650 }
651 /* otherwise, lexescape has already done the work */
652 return !ISERR();
653 }
654
655 /*
656 - lexescape - parse an ARE backslash escape (backslash already eaten)
657 * Note slightly nonstandard use of the CCLASS type code.
658 ^ static int lexescape(struct vars *);
659 */
660 static int /* not actually used, but convenient for RETV */
661 lexescape(v)
662 struct vars *v;
663 {
664 chr c;
665 static chr alert[] = {
666 CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t')
667 };
668 static chr esc[] = {
669 CHR('E'), CHR('S'), CHR('C')
670 };
671 chr *save;
672
673 assert(v->cflags&REG_ADVF);
674
675 assert(!ATEOS());
676 c = *v->now++;
677 if (!iscalnum(c))
678 RETV(PLAIN, c);
679
680 NOTE(REG_UNONPOSIX);
681 switch (c) {
682 case CHR('a'):
683 RETV(PLAIN, chrnamed(v, alert, ENDOF(alert), CHR('\007')));
684 break;
685 case CHR('A'):
686 RETV(SBEGIN, 0);
687 break;
688 case CHR('b'):
689 RETV(PLAIN, CHR('\b'));
690 break;
691 case CHR('B'):
692 RETV(PLAIN, CHR('\\'));
693 break;
694 case CHR('c'):
695 NOTE(REG_UUNPORT);
696 if (ATEOS())
697 FAILW(REG_EESCAPE);
698 RETV(PLAIN, (chr)(*v->now++ & 037));
699 break;
700 case CHR('d'):
701 NOTE(REG_ULOCALE);
702 RETV(CCLASS, 'd');
703 break;
704 case CHR('D'):
705 NOTE(REG_ULOCALE);
706 RETV(CCLASS, 'D');
707 break;
708 case CHR('e'):
709 NOTE(REG_UUNPORT);
710 RETV(PLAIN, chrnamed(v, esc, ENDOF(esc), CHR('\033')));
711 break;
712 case CHR('f'):
713 RETV(PLAIN, CHR('\f'));
714 break;
715 case CHR('m'):
716 RET('<');
717 break;
718 case CHR('M'):
719 RET('>');
720 break;
721 case CHR('n'):
722 RETV(PLAIN, CHR('\n'));
723 break;
724 case CHR('r'):
725 RETV(PLAIN, CHR('\r'));
726 break;
727 case CHR('s'):
728 NOTE(REG_ULOCALE);
729 RETV(CCLASS, 's');
730 break;
731 case CHR('S'):
732 NOTE(REG_ULOCALE);
733 RETV(CCLASS, 'S');
734 break;
735 case CHR('t'):
736 RETV(PLAIN, CHR('\t'));
737 break;
738 case CHR('u'):
739 c = lexdigits(v, 16, 4, 4);
740 if (ISERR())
741 FAILW(REG_EESCAPE);
742 RETV(PLAIN, c);
743 break;
744 case CHR('U'):
745 c = lexdigits(v, 16, 8, 8);
746 if (ISERR())
747 FAILW(REG_EESCAPE);
748 RETV(PLAIN, c);
749 break;
750 case CHR('v'):
751 RETV(PLAIN, CHR('\v'));
752 break;
753 case CHR('w'):
754 NOTE(REG_ULOCALE);
755 RETV(CCLASS, 'w');
756 break;
757 case CHR('W'):
758 NOTE(REG_ULOCALE);
759 RETV(CCLASS, 'W');
760 break;
761 case CHR('x'):
762 NOTE(REG_UUNPORT);
763 c = lexdigits(v, 16, 1, 255); /* REs >255 long outside spec */
764 if (ISERR())
765 FAILW(REG_EESCAPE);
766 RETV(PLAIN, c);
767 break;
768 case CHR('y'):
769 NOTE(REG_ULOCALE);
770 RETV(WBDRY, 0);
771 break;
772 case CHR('Y'):
773 NOTE(REG_ULOCALE);
774 RETV(NWBDRY, 0);
775 break;
776 case CHR('Z'):
777 RETV(SEND, 0);
778 break;
779 case CHR('1'): case CHR('2'): case CHR('3'): case CHR('4'):
780 case CHR('5'): case CHR('6'): case CHR('7'): case CHR('8'):
781 case CHR('9'):
782 save = v->now;
783 v->now--; /* put first digit back */
784 c = lexdigits(v, 10, 1, 255); /* REs >255 long outside spec */
785 if (ISERR())
786 FAILW(REG_EESCAPE);
787 /* ugly heuristic (first test is "exactly 1 digit?") */
788 if (v->now - save == 0 || (int)c <= v->nsubexp) {
789 NOTE(REG_UBACKREF);
790 RETV(BACKREF, (chr)c);
791 }
792 /* oops, doesn't look like it's a backref after all... */
793 v->now = save;
794 /* and fall through into octal number */
795 case CHR('0'):
796 NOTE(REG_UUNPORT);
797 v->now--; /* put first digit back */
798 c = lexdigits(v, 8, 1, 3);
799 if (ISERR())
800 FAILW(REG_EESCAPE);
801 RETV(PLAIN, c);
802 break;
803 default:
804 assert(iscalpha(c));
805 FAILW(REG_EESCAPE); /* unknown alphabetic escape */
806 break;
807 }
808 assert(NOTREACHED);
809 }
810
811 /*
812 - lexdigits - slurp up digits and return chr value
813 ^ static chr lexdigits(struct vars *, int, int, int);
814 */
815 static chr /* chr value; errors signalled via ERR */
816 lexdigits(v, base, minlen, maxlen)
817 struct vars *v;
818 int base;
819 int minlen;
820 int maxlen;
821 {
822 uchr n; /* unsigned to avoid overflow misbehavior */
823 int len;
824 chr c;
825 int d;
826 CONST uchr ub = (uchr) base;
827
828 n = 0;
829 for (len = 0; len < maxlen && !ATEOS(); len++) {
830 c = *v->now++;
831 switch (c) {
832 case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'):
833 case CHR('4'): case CHR('5'): case CHR('6'): case CHR('7'):
834 case CHR('8'): case CHR('9'):
835 d = DIGITVAL(c);
836 break;
837 case CHR('a'): case CHR('A'): d = 10; break;
838 case CHR('b'): case CHR('B'): d = 11; break;
839 case CHR('c'): case CHR('C'): d = 12; break;
840 case CHR('d'): case CHR('D'): d = 13; break;
841 case CHR('e'): case CHR('E'): d = 14; break;
842 case CHR('f'): case CHR('F'): d = 15; break;
843 default:
844 v->now--; /* oops, not a digit at all */
845 d = -1;
846 break;
847 }
848
849 if (d >= base) { /* not a plausible digit */
850 v->now--;
851 d = -1;
852 }
853 if (d < 0)
854 break; /* NOTE BREAK OUT */
855 n = n*ub + (uchr)d;
856 }
857 if (len < minlen)
858 ERR(REG_EESCAPE);
859
860 return (chr)n;
861 }
862
863 /*
864 - brenext - get next BRE token
865 * This is much like EREs except for all the stupid backslashes and the
866 * context-dependency of some things.
867 ^ static int brenext(struct vars *, pchr);
868 */
869 static int /* 1 normal, 0 failure */
870 brenext(v, pc)
871 struct vars *v;
872 pchr pc;
873 {
874 chr c = (chr)pc;
875
876 switch (c) {
877 case CHR('*'):
878 if (LASTTYPE(EMPTY) || LASTTYPE('(') || LASTTYPE('^'))
879 RETV(PLAIN, c);
880 RET('*');
881 break;
882 case CHR('['):
883 if (HAVE(6) && *(v->now+0) == CHR('[') &&
884 *(v->now+1) == CHR(':') &&
885 (*(v->now+2) == CHR('<') ||
886 *(v->now+2) == CHR('>')) &&
887 *(v->now+3) == CHR(':') &&
888 *(v->now+4) == CHR(']') &&
889 *(v->now+5) == CHR(']')) {
890 c = *(v->now+2);
891 v->now += 6;
892 NOTE(REG_UNONPOSIX);
893 RET((c == CHR('<')) ? '<' : '>');
894 }
895 INTOCON(L_BRACK);
896 if (NEXT1('^')) {
897 v->now++;
898 RETV('[', 0);
899 }
900 RETV('[', 1);
901 break;
902 case CHR('.'):
903 RET('.');
904 break;
905 case CHR('^'):
906 if (LASTTYPE(EMPTY))
907 RET('^');
908 if (LASTTYPE('(')) {
909 NOTE(REG_UUNSPEC);
910 RET('^');
911 }
912 RETV(PLAIN, c);
913 break;
914 case CHR('$'):
915 if (v->cflags&REG_EXPANDED)
916 skip(v);
917 if (ATEOS())
918 RET('$');
919 if (NEXT2('\\', ')')) {
920 NOTE(REG_UUNSPEC);
921 RET('$');
922 }
923 RETV(PLAIN, c);
924 break;
925 case CHR('\\'):
926 break; /* see below */
927 default:
928 RETV(PLAIN, c);
929 break;
930 }
931
932 assert(c == CHR('\\'));
933
934 if (ATEOS())
935 FAILW(REG_EESCAPE);
936
937 c = *v->now++;
938 switch (c) {
939 case CHR('{'):
940 INTOCON(L_BBND);
941 NOTE(REG_UBOUNDS);
942 RET('{');
943 break;
944 case CHR('('):
945 RETV('(', 1);
946 break;
947 case CHR(')'):
948 RETV(')', c);
949 break;
950 case CHR('<'):
951 NOTE(REG_UNONPOSIX);
952 RET('<');
953 break;
954 case CHR('>'):
955 NOTE(REG_UNONPOSIX);
956 RET('>');
957 break;
958 case CHR('1'): case CHR('2'): case CHR('3'): case CHR('4'):
959 case CHR('5'): case CHR('6'): case CHR('7'): case CHR('8'):
960 case CHR('9'):
961 NOTE(REG_UBACKREF);
962 RETV(BACKREF, (chr)DIGITVAL(c));
963 break;
964 default:
965 if (iscalnum(c)) {
966 NOTE(REG_UBSALNUM);
967 NOTE(REG_UUNSPEC);
968 }
969 RETV(PLAIN, c);
970 break;
971 }
972
973 assert(NOTREACHED);
974 }
975
976 /*
977 - skip - skip white space and comments in expanded form
978 ^ static VOID skip(struct vars *);
979 */
980 static VOID
981 skip(v)
982 struct vars *v;
983 {
984 chr *start = v->now;
985
986 assert(v->cflags&REG_EXPANDED);
987
988 for (;;) {
989 while (!ATEOS() && iscspace(*v->now))
990 v->now++;
991 if (ATEOS() || *v->now != CHR('#'))
992 break; /* NOTE BREAK OUT */
993 assert(NEXT1('#'));
994 while (!ATEOS() && *v->now != CHR('\n'))
995 v->now++;
996 /* leave the newline to be picked up by the iscspace loop */
997 }
998
999 if (v->now != start)
1000 NOTE(REG_UNONPOSIX);
1001 }
1002
1003 /*
1004 - newline - return the chr for a newline
1005 * This helps confine use of CHR to this source file.
1006 ^ static chr newline(NOPARMS);
1007 */
1008 static chr
1009 newline()
1010 {
1011 return CHR('\n');
1012 }
1013
/*
 - ch - return the chr sequence for regc_locale.c's fake collating element ch
 * This helps confine use of CHR to this source file.  Beware that the caller
 * knows how long the sequence is.
 ^ #ifdef REG_DEBUG
 ^ static chr *ch(NOPARMS);
 ^ #endif
 *
 * The returned pointer refers to static storage holding the chrs 'c', 'h'
 * and a terminating zero chr.  Compiled only when REG_DEBUG is defined.
 */
#ifdef REG_DEBUG
static chr *
ch()
{
    static chr chstr[] = {
        CHR('c'), CHR('h'), CHR('\0')
    };

    return &chstr[0];
}
#endif
1031
1032 /*
1033 - chrnamed - return the chr known by a given (chr string) name
1034 * The code is a bit clumsy, but this routine gets only such specialized
1035 * use that it hardly matters.
1036 ^ static chr chrnamed(struct vars *, chr *, chr *, pchr);
1037 */
1038 static chr
1039 chrnamed(v, startp, endp, lastresort)
1040 struct vars *v;
1041 chr *startp; /* start of name */
1042 chr *endp; /* just past end of name */
1043 pchr lastresort; /* what to return if name lookup fails */
1044 {
1045 celt c;
1046 int errsave;
1047 int e;
1048 struct cvec *cv;
1049
1050 errsave = v->err;
1051 v->err = 0;
1052 c = element(v, startp, endp);
1053 e = v->err;
1054 v->err = errsave;
1055
1056 if (e != 0)
1057 return (chr)lastresort;
1058
1059 cv = range(v, c, c, 0);
1060 if (cv->nchrs == 0)
1061 return (chr)lastresort;
1062 return cv->chrs[0];
1063 }
1064
1065 /* $History: regc_lex.c $
1066 *
1067 * ***************** Version 1 *****************
1068 * User: Dtashley Date: 1/02/01 Time: 12:05a
1069 * Created in $/IjuScripter, IjuConsole/Source/Tcl Base
1070 * Initial check-in.
1071 */
1072
1073 /* End of REGC_LEX.C */

dashley@gmail.com
ViewVC Help
Powered by ViewVC 1.1.25