1 |
/* $Header$ */
|
2 |
/*
|
3 |
* re_*comp and friends - compile REs
|
4 |
* This file #includes several others (see the bottom).
|
5 |
*
|
6 |
* Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
|
7 |
*
|
8 |
* Development of this software was funded, in part, by Cray Research Inc.,
|
9 |
* UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
|
10 |
* Corporation, none of whom are responsible for the results. The author
|
11 |
* thanks all of them.
|
12 |
*
|
13 |
* Redistribution and use in source and binary forms -- with or without
|
14 |
* modification -- are permitted for any purpose, provided that
|
15 |
* redistributions in source form retain this entire copyright notice and
|
16 |
* indicate the origin and nature of any modifications.
|
17 |
*
|
18 |
* I'd appreciate being given credit for this package in the documentation
|
19 |
* of software which uses it, but that is not a requirement.
|
20 |
*
|
21 |
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
|
22 |
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
|
23 |
* AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
|
24 |
* HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
25 |
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
26 |
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
27 |
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
28 |
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
|
29 |
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
|
30 |
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
31 |
*
|
32 |
*/
|
33 |
|
34 |
#include "regguts.h"
|
35 |
|
36 |
/*
|
37 |
* forward declarations, up here so forward datatypes etc. are defined early
|
38 |
*/
|
39 |
/* =====^!^===== begin forwards =====^!^===== */
|
40 |
/* automatically gathered by fwd; do not hand-edit */
|
41 |
/* === regcomp.c === */
|
42 |
int compile _ANSI_ARGS_((regex_t *, CONST chr *, size_t, int));
|
43 |
static VOID moresubs _ANSI_ARGS_((struct vars *, int));
|
44 |
static int freev _ANSI_ARGS_((struct vars *, int));
|
45 |
static VOID makesearch _ANSI_ARGS_((struct vars *, struct nfa *));
|
46 |
static struct subre *parse _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *));
|
47 |
static struct subre *parsebranch _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *, int));
|
48 |
static VOID parseqatom _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *, struct subre *));
|
49 |
static VOID nonword _ANSI_ARGS_((struct vars *, int, struct state *, struct state *));
|
50 |
static VOID word _ANSI_ARGS_((struct vars *, int, struct state *, struct state *));
|
51 |
static int scannum _ANSI_ARGS_((struct vars *));
|
52 |
static VOID repeat _ANSI_ARGS_((struct vars *, struct state *, struct state *, int, int));
|
53 |
static VOID bracket _ANSI_ARGS_((struct vars *, struct state *, struct state *));
|
54 |
static VOID cbracket _ANSI_ARGS_((struct vars *, struct state *, struct state *));
|
55 |
static VOID brackpart _ANSI_ARGS_((struct vars *, struct state *, struct state *));
|
56 |
static chr *scanplain _ANSI_ARGS_((struct vars *));
|
57 |
static VOID leaders _ANSI_ARGS_((struct vars *, struct cvec *));
|
58 |
static VOID onechr _ANSI_ARGS_((struct vars *, pchr, struct state *, struct state *));
|
59 |
static VOID dovec _ANSI_ARGS_((struct vars *, struct cvec *, struct state *, struct state *));
|
60 |
static celt nextleader _ANSI_ARGS_((struct vars *, pchr, pchr));
|
61 |
static VOID wordchrs _ANSI_ARGS_((struct vars *));
|
62 |
static struct subre *subre _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *));
|
63 |
static VOID freesubre _ANSI_ARGS_((struct vars *, struct subre *));
|
64 |
static VOID freesrnode _ANSI_ARGS_((struct vars *, struct subre *));
|
65 |
static VOID optst _ANSI_ARGS_((struct vars *, struct subre *));
|
66 |
static int numst _ANSI_ARGS_((struct subre *, int));
|
67 |
static VOID markst _ANSI_ARGS_((struct subre *));
|
68 |
static VOID cleanst _ANSI_ARGS_((struct vars *));
|
69 |
static long nfatree _ANSI_ARGS_((struct vars *, struct subre *, FILE *));
|
70 |
static long nfanode _ANSI_ARGS_((struct vars *, struct subre *, FILE *));
|
71 |
static int newlacon _ANSI_ARGS_((struct vars *, struct state *, struct state *, int));
|
72 |
static VOID freelacons _ANSI_ARGS_((struct subre *, int));
|
73 |
static VOID rfree _ANSI_ARGS_((regex_t *));
|
74 |
static VOID dump _ANSI_ARGS_((regex_t *, FILE *));
|
75 |
static VOID dumpst _ANSI_ARGS_((struct subre *, FILE *, int));
|
76 |
static VOID stdump _ANSI_ARGS_((struct subre *, FILE *, int));
|
77 |
static char *stid _ANSI_ARGS_((struct subre *, char *, size_t));
|
78 |
/* === regc_lex.c === */
|
79 |
static VOID lexstart _ANSI_ARGS_((struct vars *));
|
80 |
static VOID prefixes _ANSI_ARGS_((struct vars *));
|
81 |
static VOID lexnest _ANSI_ARGS_((struct vars *, chr *, chr *));
|
82 |
static VOID lexword _ANSI_ARGS_((struct vars *));
|
83 |
static int next _ANSI_ARGS_((struct vars *));
|
84 |
static int lexescape _ANSI_ARGS_((struct vars *));
|
85 |
static chr lexdigits _ANSI_ARGS_((struct vars *, int, int, int));
|
86 |
static int brenext _ANSI_ARGS_((struct vars *, pchr));
|
87 |
static VOID skip _ANSI_ARGS_((struct vars *));
|
88 |
static chr newline _ANSI_ARGS_((NOPARMS));
|
89 |
#ifdef REG_DEBUG
|
90 |
static chr *ch _ANSI_ARGS_((NOPARMS));
|
91 |
#endif
|
92 |
static chr chrnamed _ANSI_ARGS_((struct vars *, chr *, chr *, pchr));
|
93 |
/* === regc_color.c === */
|
94 |
static VOID initcm _ANSI_ARGS_((struct vars *, struct colormap *));
|
95 |
static VOID freecm _ANSI_ARGS_((struct colormap *));
|
96 |
static VOID cmtreefree _ANSI_ARGS_((struct colormap *, union tree *, int));
|
97 |
static color setcolor _ANSI_ARGS_((struct colormap *, pchr, pcolor));
|
98 |
static color maxcolor _ANSI_ARGS_((struct colormap *));
|
99 |
static color newcolor _ANSI_ARGS_((struct colormap *));
|
100 |
static VOID freecolor _ANSI_ARGS_((struct colormap *, pcolor));
|
101 |
static color pseudocolor _ANSI_ARGS_((struct colormap *));
|
102 |
static color subcolor _ANSI_ARGS_((struct colormap *, pchr c));
|
103 |
static color newsub _ANSI_ARGS_((struct colormap *, pcolor));
|
104 |
static VOID subrange _ANSI_ARGS_((struct vars *, pchr, pchr, struct state *, struct state *));
|
105 |
static VOID subblock _ANSI_ARGS_((struct vars *, pchr, struct state *, struct state *));
|
106 |
static VOID okcolors _ANSI_ARGS_((struct nfa *, struct colormap *));
|
107 |
static VOID colorchain _ANSI_ARGS_((struct colormap *, struct arc *));
|
108 |
static VOID uncolorchain _ANSI_ARGS_((struct colormap *, struct arc *));
|
109 |
static int singleton _ANSI_ARGS_((struct colormap *, pchr c));
|
110 |
static VOID rainbow _ANSI_ARGS_((struct nfa *, struct colormap *, int, pcolor, struct state *, struct state *));
|
111 |
static VOID colorcomplement _ANSI_ARGS_((struct nfa *, struct colormap *, int, struct state *, struct state *, struct state *));
|
112 |
#ifdef REG_DEBUG
|
113 |
static VOID dumpcolors _ANSI_ARGS_((struct colormap *, FILE *));
|
114 |
static VOID fillcheck _ANSI_ARGS_((struct colormap *, union tree *, int, FILE *));
|
115 |
static VOID dumpchr _ANSI_ARGS_((pchr, FILE *));
|
116 |
#endif
|
117 |
/* === regc_nfa.c === */
|
118 |
static struct nfa *newnfa _ANSI_ARGS_((struct vars *, struct colormap *, struct nfa *));
|
119 |
static VOID freenfa _ANSI_ARGS_((struct nfa *));
|
120 |
static struct state *newstate _ANSI_ARGS_((struct nfa *));
|
121 |
static struct state *newfstate _ANSI_ARGS_((struct nfa *, int flag));
|
122 |
static VOID dropstate _ANSI_ARGS_((struct nfa *, struct state *));
|
123 |
static VOID freestate _ANSI_ARGS_((struct nfa *, struct state *));
|
124 |
static VOID destroystate _ANSI_ARGS_((struct nfa *, struct state *));
|
125 |
static VOID newarc _ANSI_ARGS_((struct nfa *, int, pcolor, struct state *, struct state *));
|
126 |
static struct arc *allocarc _ANSI_ARGS_((struct nfa *, struct state *));
|
127 |
static VOID freearc _ANSI_ARGS_((struct nfa *, struct arc *));
|
128 |
static struct arc *findarc _ANSI_ARGS_((struct state *, int, pcolor));
|
129 |
static VOID cparc _ANSI_ARGS_((struct nfa *, struct arc *, struct state *, struct state *));
|
130 |
static VOID moveins _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
|
131 |
static VOID copyins _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
|
132 |
static VOID moveouts _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
|
133 |
static VOID copyouts _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
|
134 |
static VOID cloneouts _ANSI_ARGS_((struct nfa *, struct state *, struct state *, struct state *, int));
|
135 |
static VOID delsub _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
|
136 |
static VOID deltraverse _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
|
137 |
static VOID dupnfa _ANSI_ARGS_((struct nfa *, struct state *, struct state *, struct state *, struct state *));
|
138 |
static VOID duptraverse _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
|
139 |
static VOID cleartraverse _ANSI_ARGS_((struct nfa *, struct state *));
|
140 |
static VOID specialcolors _ANSI_ARGS_((struct nfa *));
|
141 |
static long optimize _ANSI_ARGS_((struct nfa *, FILE *));
|
142 |
static VOID pullback _ANSI_ARGS_((struct nfa *, FILE *));
|
143 |
static int pull _ANSI_ARGS_((struct nfa *, struct arc *));
|
144 |
static VOID pushfwd _ANSI_ARGS_((struct nfa *, FILE *));
|
145 |
static int push _ANSI_ARGS_((struct nfa *, struct arc *));
|
146 |
#define INCOMPATIBLE 1 /* destroys arc */
|
147 |
#define SATISFIED 2 /* constraint satisfied */
|
148 |
#define COMPATIBLE 3 /* compatible but not satisfied yet */
|
149 |
static int combine _ANSI_ARGS_((struct arc *, struct arc *));
|
150 |
static VOID fixempties _ANSI_ARGS_((struct nfa *, FILE *));
|
151 |
static int unempty _ANSI_ARGS_((struct nfa *, struct arc *));
|
152 |
static VOID cleanup _ANSI_ARGS_((struct nfa *));
|
153 |
static VOID markreachable _ANSI_ARGS_((struct nfa *, struct state *, struct state *, struct state *));
|
154 |
static VOID markcanreach _ANSI_ARGS_((struct nfa *, struct state *, struct state *, struct state *));
|
155 |
static long analyze _ANSI_ARGS_((struct nfa *));
|
156 |
static VOID compact _ANSI_ARGS_((struct nfa *, struct cnfa *));
|
157 |
static VOID carcsort _ANSI_ARGS_((struct carc *, struct carc *));
|
158 |
static VOID freecnfa _ANSI_ARGS_((struct cnfa *));
|
159 |
static VOID dumpnfa _ANSI_ARGS_((struct nfa *, FILE *));
|
160 |
#ifdef REG_DEBUG
|
161 |
static VOID dumpstate _ANSI_ARGS_((struct state *, FILE *));
|
162 |
static VOID dumparcs _ANSI_ARGS_((struct state *, FILE *));
|
163 |
static int dumprarcs _ANSI_ARGS_((struct arc *, struct state *, FILE *, int));
|
164 |
static VOID dumparc _ANSI_ARGS_((struct arc *, struct state *, FILE *));
|
165 |
#endif
|
166 |
static VOID dumpcnfa _ANSI_ARGS_((struct cnfa *, FILE *));
|
167 |
#ifdef REG_DEBUG
|
168 |
static VOID dumpcstate _ANSI_ARGS_((int, struct carc *, struct cnfa *, FILE *));
|
169 |
#endif
|
170 |
/* === regc_cvec.c === */
|
171 |
static struct cvec *newcvec _ANSI_ARGS_((int, int, int));
|
172 |
static struct cvec *clearcvec _ANSI_ARGS_((struct cvec *));
|
173 |
static VOID addchr _ANSI_ARGS_((struct cvec *, pchr));
|
174 |
static VOID addrange _ANSI_ARGS_((struct cvec *, pchr, pchr));
|
175 |
static VOID addmcce _ANSI_ARGS_((struct cvec *, chr *, chr *));
|
176 |
static int haschr _ANSI_ARGS_((struct cvec *, pchr));
|
177 |
static struct cvec *getcvec _ANSI_ARGS_((struct vars *, int, int, int));
|
178 |
static VOID freecvec _ANSI_ARGS_((struct cvec *));
|
179 |
/* === regc_locale.c === */
|
180 |
static int nmcces _ANSI_ARGS_((struct vars *));
|
181 |
static int nleaders _ANSI_ARGS_((struct vars *));
|
182 |
static struct cvec *allmcces _ANSI_ARGS_((struct vars *, struct cvec *));
|
183 |
static celt element _ANSI_ARGS_((struct vars *, chr *, chr *));
|
184 |
static struct cvec *range _ANSI_ARGS_((struct vars *, celt, celt, int));
|
185 |
static int before _ANSI_ARGS_((celt, celt));
|
186 |
static struct cvec *eclass _ANSI_ARGS_((struct vars *, celt, int));
|
187 |
static struct cvec *cclass _ANSI_ARGS_((struct vars *, chr *, chr *, int));
|
188 |
static struct cvec *allcases _ANSI_ARGS_((struct vars *, pchr));
|
189 |
static int cmp _ANSI_ARGS_((CONST chr *, CONST chr *, size_t));
|
190 |
static int casecmp _ANSI_ARGS_((CONST chr *, CONST chr *, size_t));
|
191 |
/* automatically gathered by fwd; do not hand-edit */
|
192 |
/* =====^!^===== end forwards =====^!^===== */
|
193 |
|
194 |
|
195 |
|
196 |
/* internal variables, bundled for easy passing around */
|
197 |
struct vars {
|
198 |
regex_t *re;
|
199 |
chr *now; /* scan pointer into string */
|
200 |
chr *stop; /* end of string */
|
201 |
chr *savenow; /* saved now and stop for "subroutine call" */
|
202 |
chr *savestop;
|
203 |
int err; /* error code (0 if none) */
|
204 |
int cflags; /* copy of compile flags */
|
205 |
int lasttype; /* type of previous token */
|
206 |
int nexttype; /* type of next token */
|
207 |
chr nextvalue; /* value (if any) of next token */
|
208 |
int lexcon; /* lexical context type (see lex.c) */
|
209 |
int nsubexp; /* subexpression count */
|
210 |
struct subre **subs; /* subRE pointer vector */
|
211 |
size_t nsubs; /* length of vector */
|
212 |
struct subre *sub10[10]; /* initial vector, enough for most */
|
213 |
struct nfa *nfa; /* the NFA */
|
214 |
struct colormap *cm; /* character color map */
|
215 |
color nlcolor; /* color of newline */
|
216 |
struct state *wordchrs; /* state in nfa holding word-char outarcs */
|
217 |
struct subre *tree; /* subexpression tree */
|
218 |
struct subre *treechain; /* all tree nodes allocated */
|
219 |
struct subre *treefree; /* any free tree nodes */
|
220 |
int ntree; /* number of tree nodes */
|
221 |
struct cvec *cv; /* interface cvec */
|
222 |
struct cvec *cv2; /* utility cvec */
|
223 |
struct cvec *mcces; /* collating-element information */
|
224 |
# define ISCELEADER(v,c) (v->mcces != NULL && haschr(v->mcces, (c)))
|
225 |
struct state *mccepbegin; /* in nfa, start of MCCE prototypes */
|
226 |
struct state *mccepend; /* in nfa, end of MCCE prototypes */
|
227 |
struct subre *lacons; /* lookahead-constraint vector */
|
228 |
int nlacons; /* size of lacons */
|
229 |
};
|
230 |
|
231 |
/* parsing macros; most know that `v' is the struct vars pointer */
|
232 |
#define NEXT() (next(v)) /* advance by one token */
|
233 |
#define SEE(t) (v->nexttype == (t)) /* is next token this? */
|
234 |
#define EAT(t) (SEE(t) && next(v)) /* if next is this, swallow it */
|
235 |
#define VISERR(vv) ((vv)->err != 0) /* have we seen an error yet? */
|
236 |
#define ISERR() VISERR(v)
|
237 |
#define VERR(vv,e) ((vv)->nexttype = EOS, ((vv)->err) ? (vv)->err :\
|
238 |
((vv)->err = (e)))
|
239 |
#define ERR(e) VERR(v, e) /* record an error */
|
240 |
#define NOERR() {if (ISERR()) return;} /* if error seen, return */
|
241 |
#define NOERRN() {if (ISERR()) return NULL;} /* NOERR with retval */
|
242 |
#define NOERRZ() {if (ISERR()) return 0;} /* NOERR with retval */
|
243 |
#define INSIST(c, e) ((c) ? 0 : ERR(e)) /* if condition false, error */
|
244 |
#define NOTE(b) (v->re->re_info |= (b)) /* note visible condition */
|
245 |
#define EMPTYARC(x, y) newarc(v->nfa, EMPTY, 0, x, y)
|
246 |
|
247 |
/* token type codes, some also used as NFA arc types */
|
248 |
#define EMPTY 'n' /* no token present */
|
249 |
#define EOS 'e' /* end of string */
|
250 |
#define PLAIN 'p' /* ordinary character */
|
251 |
#define DIGIT 'd' /* digit (in bound) */
|
252 |
#define BACKREF 'b' /* back reference */
|
253 |
#define COLLEL 'I' /* start of [. */
|
254 |
#define ECLASS 'E' /* start of [= */
|
255 |
#define CCLASS 'C' /* start of [: */
|
256 |
#define END 'X' /* end of [. [= [: */
|
257 |
#define RANGE 'R' /* - within [] which might be range delim. */
|
258 |
#define LACON 'L' /* lookahead constraint subRE */
|
259 |
#define AHEAD 'a' /* color-lookahead arc */
|
260 |
#define BEHIND 'r' /* color-lookbehind arc */
|
261 |
#define WBDRY 'w' /* word boundary constraint */
|
262 |
#define NWBDRY 'W' /* non-word-boundary constraint */
|
263 |
#define SBEGIN 'A' /* beginning of string (even if not BOL) */
|
264 |
#define SEND 'Z' /* end of string (even if not EOL) */
|
265 |
#define PREFER 'P' /* length preference */
|
266 |
|
267 |
/* is an arc colored, and hence on a color chain? */
|
268 |
#define COLORED(a) ((a)->type == PLAIN || (a)->type == AHEAD || \
|
269 |
(a)->type == BEHIND)
|
270 |
|
271 |
|
272 |
|
273 |
/* static function list */
|
274 |
static struct fns functions = {
|
275 |
rfree, /* regfree insides */
|
276 |
};
|
277 |
|
278 |
|
279 |
|
280 |
/*
|
281 |
- compile - compile regular expression
|
282 |
^ int compile(regex_t *, CONST chr *, size_t, int);
|
283 |
*/
|
284 |
int
|
285 |
compile(re, string, len, flags)
|
286 |
regex_t *re;
|
287 |
CONST chr *string;
|
288 |
size_t len;
|
289 |
int flags;
|
290 |
{
|
291 |
struct vars var;
|
292 |
struct vars *v = &var;
|
293 |
struct guts *g;
|
294 |
int i;
|
295 |
size_t j;
|
296 |
FILE *debug = (flags®_PROGRESS) ? stdout : (FILE *)NULL;
|
297 |
# define CNOERR() { if (ISERR()) return freev(v, v->err); }
|
298 |
|
299 |
/* sanity checks */
|
300 |
|
301 |
if (re == NULL || string == NULL)
|
302 |
return REG_INVARG;
|
303 |
if ((flags®_QUOTE) &&
|
304 |
(flags&(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE)))
|
305 |
return REG_INVARG;
|
306 |
if (!(flags®_EXTENDED) && (flags®_ADVF))
|
307 |
return REG_INVARG;
|
308 |
|
309 |
/* initial setup (after which freev() is callable) */
|
310 |
v->re = re;
|
311 |
v->now = (chr *)string;
|
312 |
v->stop = v->now + len;
|
313 |
v->savenow = v->savestop = NULL;
|
314 |
v->err = 0;
|
315 |
v->cflags = flags;
|
316 |
v->nsubexp = 0;
|
317 |
v->subs = v->sub10;
|
318 |
v->nsubs = 10;
|
319 |
for (j = 0; j < v->nsubs; j++)
|
320 |
v->subs[j] = NULL;
|
321 |
v->nfa = NULL;
|
322 |
v->cm = NULL;
|
323 |
v->nlcolor = COLORLESS;
|
324 |
v->wordchrs = NULL;
|
325 |
v->tree = NULL;
|
326 |
v->treechain = NULL;
|
327 |
v->treefree = NULL;
|
328 |
v->cv = NULL;
|
329 |
v->cv2 = NULL;
|
330 |
v->mcces = NULL;
|
331 |
v->lacons = NULL;
|
332 |
v->nlacons = 0;
|
333 |
re->re_magic = REMAGIC;
|
334 |
re->re_info = 0; /* bits get set during parse */
|
335 |
re->re_csize = sizeof(chr);
|
336 |
re->re_guts = NULL;
|
337 |
re->re_fns = VS(&functions);
|
338 |
|
339 |
/* more complex setup, malloced things */
|
340 |
re->re_guts = VS(MALLOC(sizeof(struct guts)));
|
341 |
if (re->re_guts == NULL)
|
342 |
return freev(v, REG_ESPACE);
|
343 |
g = (struct guts *)re->re_guts;
|
344 |
g->tree = NULL;
|
345 |
initcm(v, &g->cmap);
|
346 |
v->cm = &g->cmap;
|
347 |
g->lacons = NULL;
|
348 |
g->nlacons = 0;
|
349 |
ZAPCNFA(g->search);
|
350 |
v->nfa = newnfa(v, v->cm, (struct nfa *)NULL);
|
351 |
CNOERR();
|
352 |
v->cv = newcvec(100, 20, 10);
|
353 |
if (v->cv == NULL)
|
354 |
return freev(v, REG_ESPACE);
|
355 |
i = nmcces(v);
|
356 |
if (i > 0) {
|
357 |
v->mcces = newcvec(nleaders(v), 0, i);
|
358 |
CNOERR();
|
359 |
v->mcces = allmcces(v, v->mcces);
|
360 |
leaders(v, v->mcces);
|
361 |
addmcce(v->mcces, (chr *)NULL, (chr *)NULL); /* dummy */
|
362 |
}
|
363 |
CNOERR();
|
364 |
|
365 |
/* parsing */
|
366 |
lexstart(v); /* also handles prefixes */
|
367 |
if ((v->cflags®_NLSTOP) || (v->cflags®_NLANCH)) {
|
368 |
/* assign newline a unique color */
|
369 |
v->nlcolor = subcolor(v->cm, newline());
|
370 |
okcolors(v->nfa, v->cm);
|
371 |
}
|
372 |
CNOERR();
|
373 |
v->tree = parse(v, EOS, PLAIN, v->nfa->init, v->nfa->final);
|
374 |
assert(SEE(EOS)); /* even if error; ISERR() => SEE(EOS) */
|
375 |
CNOERR();
|
376 |
assert(v->tree != NULL);
|
377 |
|
378 |
/* finish setup of nfa and its subre tree */
|
379 |
specialcolors(v->nfa);
|
380 |
CNOERR();
|
381 |
if (debug != NULL) {
|
382 |
fprintf(debug, "\n\n\n========= RAW ==========\n");
|
383 |
dumpnfa(v->nfa, debug);
|
384 |
dumpst(v->tree, debug, 1);
|
385 |
}
|
386 |
optst(v, v->tree);
|
387 |
v->ntree = numst(v->tree, 1);
|
388 |
markst(v->tree);
|
389 |
cleanst(v);
|
390 |
if (debug != NULL) {
|
391 |
fprintf(debug, "\n\n\n========= TREE FIXED ==========\n");
|
392 |
dumpst(v->tree, debug, 1);
|
393 |
}
|
394 |
|
395 |
/* build compacted NFAs for tree and lacons */
|
396 |
re->re_info |= nfatree(v, v->tree, debug);
|
397 |
CNOERR();
|
398 |
assert(v->nlacons == 0 || v->lacons != NULL);
|
399 |
for (i = 1; i < v->nlacons; i++) {
|
400 |
if (debug != NULL)
|
401 |
fprintf(debug, "\n\n\n========= LA%d ==========\n", i);
|
402 |
nfanode(v, &v->lacons[i], debug);
|
403 |
}
|
404 |
CNOERR();
|
405 |
if (v->tree->flags&SHORTER)
|
406 |
NOTE(REG_USHORTEST);
|
407 |
|
408 |
/* build compacted NFAs for tree, lacons, fast search */
|
409 |
if (debug != NULL)
|
410 |
fprintf(debug, "\n\n\n========= SEARCH ==========\n");
|
411 |
/* can sacrifice main NFA now, so use it as work area */
|
412 |
(DISCARD)optimize(v->nfa, debug);
|
413 |
CNOERR();
|
414 |
makesearch(v, v->nfa);
|
415 |
CNOERR();
|
416 |
compact(v->nfa, &g->search);
|
417 |
CNOERR();
|
418 |
|
419 |
/* looks okay, package it up */
|
420 |
re->re_nsub = v->nsubexp;
|
421 |
v->re = NULL; /* freev no longer frees re */
|
422 |
g->magic = GUTSMAGIC;
|
423 |
g->cflags = v->cflags;
|
424 |
g->info = re->re_info;
|
425 |
g->nsub = re->re_nsub;
|
426 |
g->tree = v->tree;
|
427 |
v->tree = NULL;
|
428 |
g->ntree = v->ntree;
|
429 |
g->compare = (v->cflags®_ICASE) ? casecmp : cmp;
|
430 |
g->lacons = v->lacons;
|
431 |
v->lacons = NULL;
|
432 |
g->nlacons = v->nlacons;
|
433 |
|
434 |
if (flags®_DUMP)
|
435 |
dump(re, stdout);
|
436 |
|
437 |
assert(v->err == 0);
|
438 |
return freev(v, 0);
|
439 |
}
|
440 |
|
441 |
/*
|
442 |
- moresubs - enlarge subRE vector
|
443 |
^ static VOID moresubs(struct vars *, int);
|
444 |
*/
|
445 |
static VOID
|
446 |
moresubs(v, wanted)
|
447 |
struct vars *v;
|
448 |
int wanted; /* want enough room for this one */
|
449 |
{
|
450 |
struct subre **p;
|
451 |
size_t n;
|
452 |
|
453 |
assert(wanted > 0 && (size_t)wanted >= v->nsubs);
|
454 |
n = (size_t)wanted * 3 / 2 + 1;
|
455 |
if (v->subs == v->sub10) {
|
456 |
p = (struct subre **)MALLOC(n * sizeof(struct subre *));
|
457 |
if (p != NULL)
|
458 |
memcpy(VS(p), VS(v->subs),
|
459 |
v->nsubs * sizeof(struct subre *));
|
460 |
} else
|
461 |
p = (struct subre **)REALLOC(v->subs, n*sizeof(struct subre *));
|
462 |
if (p == NULL) {
|
463 |
ERR(REG_ESPACE);
|
464 |
return;
|
465 |
}
|
466 |
v->subs = p;
|
467 |
for (p = &v->subs[v->nsubs]; v->nsubs < n; p++, v->nsubs++)
|
468 |
*p = NULL;
|
469 |
assert(v->nsubs == n);
|
470 |
assert((size_t)wanted < v->nsubs);
|
471 |
}
|
472 |
|
473 |
/*
|
474 |
- freev - free vars struct's substructures where necessary
|
475 |
* Optionally does error-number setting, and always returns error code
|
476 |
* (if any), to make error-handling code terser.
|
477 |
^ static int freev(struct vars *, int);
|
478 |
*/
|
479 |
static int
|
480 |
freev(v, err)
|
481 |
struct vars *v;
|
482 |
int err;
|
483 |
{
|
484 |
if (v->re != NULL)
|
485 |
rfree(v->re);
|
486 |
if (v->subs != v->sub10)
|
487 |
FREE(v->subs);
|
488 |
if (v->nfa != NULL)
|
489 |
freenfa(v->nfa);
|
490 |
if (v->tree != NULL)
|
491 |
freesubre(v, v->tree);
|
492 |
if (v->treechain != NULL)
|
493 |
cleanst(v);
|
494 |
if (v->cv != NULL)
|
495 |
freecvec(v->cv);
|
496 |
if (v->cv2 != NULL)
|
497 |
freecvec(v->cv2);
|
498 |
if (v->mcces != NULL)
|
499 |
freecvec(v->mcces);
|
500 |
if (v->lacons != NULL)
|
501 |
freelacons(v->lacons, v->nlacons);
|
502 |
ERR(err); /* nop if err==0 */
|
503 |
|
504 |
return v->err;
|
505 |
}
|
506 |
|
507 |
/*
|
508 |
- makesearch - turn an NFA into a search NFA (implicit prepend of .*?)
|
509 |
* NFA must have been optimize()d already.
|
510 |
^ static VOID makesearch(struct vars *, struct nfa *);
|
511 |
*/
|
512 |
static VOID
|
513 |
makesearch(v, nfa)
|
514 |
struct vars *v;
|
515 |
struct nfa *nfa;
|
516 |
{
|
517 |
struct arc *a;
|
518 |
struct arc *b;
|
519 |
struct state *pre = nfa->pre;
|
520 |
struct state *s;
|
521 |
struct state *s2;
|
522 |
struct state *slist;
|
523 |
|
524 |
/* no loops are needed if it's anchored */
|
525 |
for (a = pre->outs; a != NULL; a = a->outchain) {
|
526 |
assert(a->type == PLAIN);
|
527 |
if (a->co != nfa->bos[0] && a->co != nfa->bos[1])
|
528 |
break;
|
529 |
}
|
530 |
if (a != NULL) {
|
531 |
/* add implicit .* in front */
|
532 |
rainbow(nfa, v->cm, PLAIN, COLORLESS, pre, pre);
|
533 |
|
534 |
/* and ^* and \A* too -- not always necessary, but harmless */
|
535 |
newarc(nfa, PLAIN, nfa->bos[0], pre, pre);
|
536 |
newarc(nfa, PLAIN, nfa->bos[1], pre, pre);
|
537 |
}
|
538 |
|
539 |
/*
|
540 |
* Now here's the subtle part. Because many REs have no lookback
|
541 |
* constraints, often knowing when you were in the pre state tells
|
542 |
* you little; it's the next state(s) that are informative. But
|
543 |
* some of them may have other inarcs, i.e. it may be possible to
|
544 |
* make actual progress and then return to one of them. We must
|
545 |
* de-optimize such cases, splitting each such state into progress
|
546 |
* and no-progress states.
|
547 |
*/
|
548 |
|
549 |
/* first, make a list of the states */
|
550 |
slist = NULL;
|
551 |
for (a = pre->outs; a != NULL; a = a->outchain) {
|
552 |
s = a->to;
|
553 |
for (b = s->ins; b != NULL; b = b->inchain)
|
554 |
if (b->from != pre)
|
555 |
break;
|
556 |
if (b != NULL) { /* must be split */
|
557 |
s->tmp = slist;
|
558 |
slist = s;
|
559 |
}
|
560 |
}
|
561 |
|
562 |
/* do the splits */
|
563 |
for (s = slist; s != NULL; s = s2) {
|
564 |
s2 = newstate(nfa);
|
565 |
copyouts(nfa, s, s2);
|
566 |
for (a = s->ins; a != NULL; a = b) {
|
567 |
b = a->inchain;
|
568 |
if (a->from != pre) {
|
569 |
cparc(nfa, a, a->from, s2);
|
570 |
freearc(nfa, a);
|
571 |
}
|
572 |
}
|
573 |
s2 = s->tmp;
|
574 |
s->tmp = NULL; /* clean up while we're at it */
|
575 |
}
|
576 |
}
|
577 |
|
578 |
/*
|
579 |
- parse - parse an RE
|
580 |
* This is actually just the top level, which parses a bunch of branches
|
581 |
* tied together with '|'. They appear in the tree as the left children
|
582 |
* of a chain of '|' subres.
|
583 |
^ static struct subre *parse(struct vars *, int, int, struct state *,
|
584 |
^ struct state *);
|
585 |
*/
|
586 |
static struct subre *
|
587 |
parse(v, stopper, type, init, final)
|
588 |
struct vars *v;
|
589 |
int stopper; /* EOS or ')' */
|
590 |
int type; /* LACON (lookahead subRE) or PLAIN */
|
591 |
struct state *init; /* initial state */
|
592 |
struct state *final; /* final state */
|
593 |
{
|
594 |
struct state *left; /* scaffolding for branch */
|
595 |
struct state *right;
|
596 |
struct subre *branches; /* top level */
|
597 |
struct subre *branch; /* current branch */
|
598 |
struct subre *t; /* temporary */
|
599 |
int firstbranch; /* is this the first branch? */
|
600 |
|
601 |
assert(stopper == ')' || stopper == EOS);
|
602 |
|
603 |
branches = subre(v, '|', LONGER, init, final);
|
604 |
NOERRN();
|
605 |
branch = branches;
|
606 |
firstbranch = 1;
|
607 |
do { /* a branch */
|
608 |
if (!firstbranch) {
|
609 |
/* need a place to hang it */
|
610 |
branch->right = subre(v, '|', LONGER, init, final);
|
611 |
NOERRN();
|
612 |
branch = branch->right;
|
613 |
}
|
614 |
firstbranch = 0;
|
615 |
left = newstate(v->nfa);
|
616 |
right = newstate(v->nfa);
|
617 |
NOERRN();
|
618 |
EMPTYARC(init, left);
|
619 |
EMPTYARC(right, final);
|
620 |
NOERRN();
|
621 |
branch->left = parsebranch(v, stopper, type, left, right, 0);
|
622 |
NOERRN();
|
623 |
branch->flags |= UP(branch->flags | branch->left->flags);
|
624 |
if ((branch->flags &~ branches->flags) != 0) /* new flags */
|
625 |
for (t = branches; t != branch; t = t->right)
|
626 |
t->flags |= branch->flags;
|
627 |
} while (EAT('|'));
|
628 |
assert(SEE(stopper) || SEE(EOS));
|
629 |
|
630 |
if (!SEE(stopper)) {
|
631 |
assert(stopper == ')' && SEE(EOS));
|
632 |
ERR(REG_EPAREN);
|
633 |
}
|
634 |
|
635 |
/* optimize out simple cases */
|
636 |
if (branch == branches) { /* only one branch */
|
637 |
assert(branch->right == NULL);
|
638 |
t = branch->left;
|
639 |
branch->left = NULL;
|
640 |
freesubre(v, branches);
|
641 |
branches = t;
|
642 |
} else if (!MESSY(branches->flags)) { /* no interesting innards */
|
643 |
freesubre(v, branches->left);
|
644 |
branches->left = NULL;
|
645 |
freesubre(v, branches->right);
|
646 |
branches->right = NULL;
|
647 |
branches->op = '=';
|
648 |
}
|
649 |
|
650 |
return branches;
|
651 |
}
|
652 |
|
653 |
/*
|
654 |
- parsebranch - parse one branch of an RE
|
655 |
* This mostly manages concatenation, working closely with parseqatom().
|
656 |
* Concatenated things are bundled up as much as possible, with separate
|
657 |
* ',' nodes introduced only when necessary due to substructure.
|
658 |
^ static struct subre *parsebranch(struct vars *, int, int, struct state *,
|
659 |
^ struct state *, int);
|
660 |
*/
|
661 |
static struct subre *
|
662 |
parsebranch(v, stopper, type, left, right, partial)
|
663 |
struct vars *v;
|
664 |
int stopper; /* EOS or ')' */
|
665 |
int type; /* LACON (lookahead subRE) or PLAIN */
|
666 |
struct state *left; /* leftmost state */
|
667 |
struct state *right; /* rightmost state */
|
668 |
int partial; /* is this only part of a branch? */
|
669 |
{
|
670 |
struct state *lp; /* left end of current construct */
|
671 |
int seencontent; /* is there anything in this branch yet? */
|
672 |
struct subre *t;
|
673 |
|
674 |
lp = left;
|
675 |
seencontent = 0;
|
676 |
t = subre(v, '=', 0, left, right); /* op '=' is tentative */
|
677 |
NOERRN();
|
678 |
while (!SEE('|') && !SEE(stopper) && !SEE(EOS)) {
|
679 |
if (seencontent) { /* implicit concat operator */
|
680 |
lp = newstate(v->nfa);
|
681 |
NOERRN();
|
682 |
moveins(v->nfa, right, lp);
|
683 |
}
|
684 |
seencontent = 1;
|
685 |
|
686 |
/* NB, recursion in parseqatom() may swallow rest of branch */
|
687 |
parseqatom(v, stopper, type, lp, right, t);
|
688 |
}
|
689 |
|
690 |
if (!seencontent) { /* empty branch */
|
691 |
if (!partial)
|
692 |
NOTE(REG_UUNSPEC);
|
693 |
assert(lp == left);
|
694 |
EMPTYARC(left, right);
|
695 |
}
|
696 |
|
697 |
return t;
|
698 |
}
|
699 |
|
700 |
/*
|
701 |
- parseqatom - parse one quantified atom or constraint of an RE
|
702 |
* The bookkeeping near the end cooperates very closely with parsebranch();
|
703 |
* in particular, it contains a recursion that can involve parsing the rest
|
704 |
* of the branch, making this function's name somewhat inaccurate.
|
705 |
^ static VOID parseqatom(struct vars *, int, int, struct state *,
|
706 |
^ struct state *, struct subre *);
|
707 |
*/
|
708 |
static VOID
|
709 |
parseqatom(v, stopper, type, lp, rp, top)
|
710 |
struct vars *v;
|
711 |
int stopper; /* EOS or ')' */
|
712 |
int type; /* LACON (lookahead subRE) or PLAIN */
|
713 |
struct state *lp; /* left state to hang it on */
|
714 |
struct state *rp; /* right state to hang it on */
|
715 |
struct subre *top; /* subtree top */
|
716 |
{
|
717 |
struct state *s; /* temporaries for new states */
|
718 |
struct state *s2;
|
719 |
# define ARCV(t, val) newarc(v->nfa, t, val, lp, rp)
|
720 |
int m, n;
|
721 |
struct subre *atom; /* atom's subtree */
|
722 |
struct subre *t;
|
723 |
int cap; /* capturing parens? */
|
724 |
int pos; /* positive lookahead? */
|
725 |
int subno; /* capturing-parens or backref number */
|
726 |
int atomtype;
|
727 |
int qprefer; /* quantifier short/long preference */
|
728 |
int f;
|
729 |
struct subre **atomp; /* where the pointer to atom is */
|
730 |
|
731 |
/* initial bookkeeping */
|
732 |
atom = NULL;
|
733 |
assert(lp->nouts == 0); /* must string new code */
|
734 |
assert(rp->nins == 0); /* between lp and rp */
|
735 |
subno = 0; /* just to shut lint up */
|
736 |
|
737 |
/* an atom or constraint... */
|
738 |
atomtype = v->nexttype;
|
739 |
switch (atomtype) {
|
740 |
/* first, constraints, which end by returning */
|
741 |
case '^':
|
742 |
ARCV('^', 1);
|
743 |
if (v->cflags®_NLANCH)
|
744 |
ARCV(BEHIND, v->nlcolor);
|
745 |
NEXT();
|
746 |
return;
|
747 |
break;
|
748 |
case '$':
|
749 |
ARCV('$', 1);
|
750 |
if (v->cflags®_NLANCH)
|
751 |
ARCV(AHEAD, v->nlcolor);
|
752 |
NEXT();
|
753 |
return;
|
754 |
break;
|
755 |
case SBEGIN:
|
756 |
ARCV('^', 1); /* BOL */
|
757 |
ARCV('^', 0); /* or BOS */
|
758 |
NEXT();
|
759 |
return;
|
760 |
break;
|
761 |
case SEND:
|
762 |
ARCV('$', 1); /* EOL */
|
763 |
ARCV('$', 0); /* or EOS */
|
764 |
NEXT();
|
765 |
return;
|
766 |
break;
|
767 |
case '<':
|
768 |
wordchrs(v); /* does NEXT() */
|
769 |
s = newstate(v->nfa);
|
770 |
NOERR();
|
771 |
nonword(v, BEHIND, lp, s);
|
772 |
word(v, AHEAD, s, rp);
|
773 |
return;
|
774 |
break;
|
775 |
case '>':
|
776 |
wordchrs(v); /* does NEXT() */
|
777 |
s = newstate(v->nfa);
|
778 |
NOERR();
|
779 |
word(v, BEHIND, lp, s);
|
780 |
nonword(v, AHEAD, s, rp);
|
781 |
return;
|
782 |
break;
|
783 |
case WBDRY:
|
784 |
wordchrs(v); /* does NEXT() */
|
785 |
s = newstate(v->nfa);
|
786 |
NOERR();
|
787 |
nonword(v, BEHIND, lp, s);
|
788 |
word(v, AHEAD, s, rp);
|
789 |
s = newstate(v->nfa);
|
790 |
NOERR();
|
791 |
word(v, BEHIND, lp, s);
|
792 |
nonword(v, AHEAD, s, rp);
|
793 |
return;
|
794 |
break;
|
795 |
case NWBDRY:
|
796 |
wordchrs(v); /* does NEXT() */
|
797 |
s = newstate(v->nfa);
|
798 |
NOERR();
|
799 |
word(v, BEHIND, lp, s);
|
800 |
word(v, AHEAD, s, rp);
|
801 |
s = newstate(v->nfa);
|
802 |
NOERR();
|
803 |
nonword(v, BEHIND, lp, s);
|
804 |
nonword(v, AHEAD, s, rp);
|
805 |
return;
|
806 |
break;
|
807 |
case LACON: /* lookahead constraint */
|
808 |
pos = v->nextvalue;
|
809 |
NEXT();
|
810 |
s = newstate(v->nfa);
|
811 |
s2 = newstate(v->nfa);
|
812 |
NOERR();
|
813 |
t = parse(v, ')', LACON, s, s2);
|
814 |
freesubre(v, t); /* internal structure irrelevant */
|
815 |
assert(SEE(')') || ISERR());
|
816 |
NEXT();
|
817 |
n = newlacon(v, s, s2, pos);
|
818 |
NOERR();
|
819 |
ARCV(LACON, n);
|
820 |
return;
|
821 |
break;
|
822 |
/* then errors, to get them out of the way */
|
823 |
case '*':
|
824 |
case '+':
|
825 |
case '?':
|
826 |
case '{':
|
827 |
ERR(REG_BADRPT);
|
828 |
return;
|
829 |
break;
|
830 |
default:
|
831 |
ERR(REG_ASSERT);
|
832 |
return;
|
833 |
break;
|
834 |
/* then plain characters, and minor variants on that theme */
|
835 |
case ')': /* unbalanced paren */
|
836 |
if ((v->cflags®_ADVANCED) != REG_EXTENDED) {
|
837 |
ERR(REG_EPAREN);
|
838 |
return;
|
839 |
}
|
840 |
/* legal in EREs due to specification botch */
|
841 |
NOTE(REG_UPBOTCH);
|
842 |
/* fallthrough into case PLAIN */
|
843 |
case PLAIN:
|
844 |
onechr(v, v->nextvalue, lp, rp);
|
845 |
okcolors(v->nfa, v->cm);
|
846 |
NOERR();
|
847 |
NEXT();
|
848 |
break;
|
849 |
case '[':
|
850 |
if (v->nextvalue == 1)
|
851 |
bracket(v, lp, rp);
|
852 |
else
|
853 |
cbracket(v, lp, rp);
|
854 |
assert(SEE(']') || ISERR());
|
855 |
NEXT();
|
856 |
break;
|
857 |
case '.':
|
858 |
rainbow(v->nfa, v->cm, PLAIN,
|
859 |
(v->cflags®_NLSTOP) ? v->nlcolor : COLORLESS,
|
860 |
lp, rp);
|
861 |
NEXT();
|
862 |
break;
|
863 |
/* and finally the ugly stuff */
|
864 |
case '(': /* value flags as capturing or non */
|
865 |
cap = (type == LACON) ? 0 : v->nextvalue;
|
866 |
if (cap) {
|
867 |
v->nsubexp++;
|
868 |
subno = v->nsubexp;
|
869 |
if ((size_t)subno >= v->nsubs)
|
870 |
moresubs(v, subno);
|
871 |
assert((size_t)subno < v->nsubs);
|
872 |
} else
|
873 |
atomtype = PLAIN; /* something that's not '(' */
|
874 |
NEXT();
|
875 |
/* need new endpoints because tree will contain pointers */
|
876 |
s = newstate(v->nfa);
|
877 |
s2 = newstate(v->nfa);
|
878 |
NOERR();
|
879 |
EMPTYARC(lp, s);
|
880 |
EMPTYARC(s2, rp);
|
881 |
NOERR();
|
882 |
atom = parse(v, ')', PLAIN, s, s2);
|
883 |
assert(SEE(')') || ISERR());
|
884 |
NEXT();
|
885 |
NOERR();
|
886 |
if (cap) {
|
887 |
v->subs[subno] = atom;
|
888 |
t = subre(v, '(', atom->flags|CAP, lp, rp);
|
889 |
NOERR();
|
890 |
t->subno = subno;
|
891 |
t->left = atom;
|
892 |
atom = t;
|
893 |
}
|
894 |
/* postpone everything else pending possible {0} */
|
895 |
break;
|
896 |
case BACKREF: /* the Feature From The Black Lagoon */
|
897 |
INSIST(type != LACON, REG_ESUBREG);
|
898 |
INSIST(v->nextvalue < v->nsubs, REG_ESUBREG);
|
899 |
INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG);
|
900 |
NOERR();
|
901 |
assert(v->nextvalue > 0);
|
902 |
atom = subre(v, 'b', BACKR, lp, rp);
|
903 |
subno = v->nextvalue;
|
904 |
atom->subno = subno;
|
905 |
EMPTYARC(lp, rp); /* temporarily, so there's something */
|
906 |
NEXT();
|
907 |
break;
|
908 |
}
|
909 |
|
910 |
/* ...and an atom may be followed by a quantifier */
|
911 |
switch (v->nexttype) {
|
912 |
case '*':
|
913 |
m = 0;
|
914 |
n = INFINITY;
|
915 |
qprefer = (v->nextvalue) ? LONGER : SHORTER;
|
916 |
NEXT();
|
917 |
break;
|
918 |
case '+':
|
919 |
m = 1;
|
920 |
n = INFINITY;
|
921 |
qprefer = (v->nextvalue) ? LONGER : SHORTER;
|
922 |
NEXT();
|
923 |
break;
|
924 |
case '?':
|
925 |
m = 0;
|
926 |
n = 1;
|
927 |
qprefer = (v->nextvalue) ? LONGER : SHORTER;
|
928 |
NEXT();
|
929 |
break;
|
930 |
case '{':
|
931 |
NEXT();
|
932 |
m = scannum(v);
|
933 |
if (EAT(',')) {
|
934 |
if (SEE(DIGIT))
|
935 |
n = scannum(v);
|
936 |
else
|
937 |
n = INFINITY;
|
938 |
if (m > n) {
|
939 |
ERR(REG_BADBR);
|
940 |
return;
|
941 |
}
|
942 |
/* {m,n} exercises preference, even if it's {m,m} */
|
943 |
qprefer = (v->nextvalue) ? LONGER : SHORTER;
|
944 |
} else {
|
945 |
n = m;
|
946 |
/* {m} passes operand's preference through */
|
947 |
qprefer = 0;
|
948 |
}
|
949 |
if (!SEE('}')) { /* catches errors too */
|
950 |
ERR(REG_BADBR);
|
951 |
return;
|
952 |
}
|
953 |
NEXT();
|
954 |
break;
|
955 |
default: /* no quantifier */
|
956 |
m = n = 1;
|
957 |
qprefer = 0;
|
958 |
break;
|
959 |
}
|
960 |
|
961 |
/* annoying special case: {0} or {0,0} cancels everything */
|
962 |
if (m == 0 && n == 0) {
|
963 |
if (atom != NULL)
|
964 |
freesubre(v, atom);
|
965 |
if (atomtype == '(')
|
966 |
v->subs[subno] = NULL;
|
967 |
delsub(v->nfa, lp, rp);
|
968 |
EMPTYARC(lp, rp);
|
969 |
return;
|
970 |
}
|
971 |
|
972 |
/* if not a messy case, avoid hard part */
|
973 |
assert(!MESSY(top->flags));
|
974 |
f = top->flags | qprefer | ((atom != NULL) ? atom->flags : 0);
|
975 |
if (atomtype != '(' && atomtype != BACKREF && !MESSY(UP(f))) {
|
976 |
if (!(m == 1 && n == 1))
|
977 |
repeat(v, lp, rp, m, n);
|
978 |
if (atom != NULL)
|
979 |
freesubre(v, atom);
|
980 |
top->flags = f;
|
981 |
return;
|
982 |
}
|
983 |
|
984 |
/*
|
985 |
* hard part: something messy
|
986 |
* That is, capturing parens, back reference, short/long clash, or
|
987 |
* an atom with substructure containing one of those.
|
988 |
*/
|
989 |
|
990 |
/* now we'll need a subre for the contents even if they're boring */
|
991 |
if (atom == NULL) {
|
992 |
atom = subre(v, '=', 0, lp, rp);
|
993 |
NOERR();
|
994 |
}
|
995 |
|
996 |
/*
|
997 |
* prepare a general-purpose state skeleton
|
998 |
*
|
999 |
* ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp]
|
1000 |
* / /
|
1001 |
* [lp] ----> [s2] ----bypass---------------------
|
1002 |
*
|
1003 |
* where bypass is an empty, and prefix is some repetitions of atom
|
1004 |
*/
|
1005 |
s = newstate(v->nfa); /* first, new endpoints for the atom */
|
1006 |
s2 = newstate(v->nfa);
|
1007 |
NOERR();
|
1008 |
moveouts(v->nfa, lp, s);
|
1009 |
moveins(v->nfa, rp, s2);
|
1010 |
NOERR();
|
1011 |
atom->begin = s;
|
1012 |
atom->end = s2;
|
1013 |
s = newstate(v->nfa); /* and spots for prefix and bypass */
|
1014 |
s2 = newstate(v->nfa);
|
1015 |
NOERR();
|
1016 |
EMPTYARC(lp, s);
|
1017 |
EMPTYARC(lp, s2);
|
1018 |
NOERR();
|
1019 |
|
1020 |
/* break remaining subRE into x{...} and what follows */
|
1021 |
t = subre(v, '.', COMBINE(qprefer, atom->flags), lp, rp);
|
1022 |
t->left = atom;
|
1023 |
atomp = &t->left;
|
1024 |
/* here we should recurse... but we must postpone that to the end */
|
1025 |
|
1026 |
/* split top into prefix and remaining */
|
1027 |
assert(top->op == '=' && top->left == NULL && top->right == NULL);
|
1028 |
top->left = subre(v, '=', top->flags, top->begin, lp);
|
1029 |
top->op = '.';
|
1030 |
top->right = t;
|
1031 |
|
1032 |
/* if it's a backref, now is the time to replicate the subNFA */
|
1033 |
if (atomtype == BACKREF) {
|
1034 |
assert(atom->begin->nouts == 1); /* just the EMPTY */
|
1035 |
delsub(v->nfa, atom->begin, atom->end);
|
1036 |
assert(v->subs[subno] != NULL);
|
1037 |
/* and here's why the recursion got postponed: it must */
|
1038 |
/* wait until the skeleton is filled in, because it may */
|
1039 |
/* hit a backref that wants to copy the filled-in skeleton */
|
1040 |
dupnfa(v->nfa, v->subs[subno]->begin, v->subs[subno]->end,
|
1041 |
atom->begin, atom->end);
|
1042 |
NOERR();
|
1043 |
}
|
1044 |
|
1045 |
/* it's quantifier time; first, turn x{0,...} into x{1,...}|empty */
|
1046 |
if (m == 0) {
|
1047 |
EMPTYARC(s2, atom->end); /* the bypass */
|
1048 |
assert(PREF(qprefer) != 0);
|
1049 |
f = COMBINE(qprefer, atom->flags);
|
1050 |
t = subre(v, '|', f, lp, atom->end);
|
1051 |
NOERR();
|
1052 |
t->left = atom;
|
1053 |
t->right = subre(v, '|', PREF(f), s2, atom->end);
|
1054 |
NOERR();
|
1055 |
t->right->left = subre(v, '=', 0, s2, atom->end);
|
1056 |
NOERR();
|
1057 |
*atomp = t;
|
1058 |
atomp = &t->left;
|
1059 |
m = 1;
|
1060 |
}
|
1061 |
|
1062 |
/* deal with the rest of the quantifier */
|
1063 |
if (atomtype == BACKREF) {
|
1064 |
/* special case: backrefs have internal quantifiers */
|
1065 |
EMPTYARC(s, atom->begin); /* empty prefix */
|
1066 |
/* just stuff everything into atom */
|
1067 |
repeat(v, atom->begin, atom->end, m, n);
|
1068 |
atom->min = (short)m;
|
1069 |
atom->max = (short)n;
|
1070 |
atom->flags |= COMBINE(qprefer, atom->flags);
|
1071 |
} else if (m == 1 && n == 1) {
|
1072 |
/* no/vacuous quantifier: done */
|
1073 |
EMPTYARC(s, atom->begin); /* empty prefix */
|
1074 |
} else {
|
1075 |
/* turn x{m,n} into x{m-1,n-1}x, with capturing */
|
1076 |
/* parens in only second x */
|
1077 |
dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin);
|
1078 |
assert(m >= 1 && m != INFINITY && n >= 1);
|
1079 |
repeat(v, s, atom->begin, m-1, (n == INFINITY) ? n : n-1);
|
1080 |
f = COMBINE(qprefer, atom->flags);
|
1081 |
t = subre(v, '.', f, s, atom->end); /* prefix and atom */
|
1082 |
NOERR();
|
1083 |
t->left = subre(v, '=', PREF(f), s, atom->begin);
|
1084 |
NOERR();
|
1085 |
t->right = atom;
|
1086 |
*atomp = t;
|
1087 |
}
|
1088 |
|
1089 |
/* and finally, look after that postponed recursion */
|
1090 |
t = top->right;
|
1091 |
if (!(SEE('|') || SEE(stopper) || SEE(EOS)))
|
1092 |
t->right = parsebranch(v, stopper, type, atom->end, rp, 1);
|
1093 |
else {
|
1094 |
EMPTYARC(atom->end, rp);
|
1095 |
t->right = subre(v, '=', 0, atom->end, rp);
|
1096 |
}
|
1097 |
assert(SEE('|') || SEE(stopper) || SEE(EOS));
|
1098 |
t->flags |= COMBINE(t->flags, t->right->flags);
|
1099 |
top->flags |= COMBINE(top->flags, t->flags);
|
1100 |
}
|
1101 |
|
1102 |
/*
|
1103 |
- nonword - generate arcs for non-word-character ahead or behind
|
1104 |
^ static VOID nonword(struct vars *, int, struct state *, struct state *);
|
1105 |
*/
|
1106 |
static VOID
|
1107 |
nonword(v, dir, lp, rp)
|
1108 |
struct vars *v;
|
1109 |
int dir; /* AHEAD or BEHIND */
|
1110 |
struct state *lp;
|
1111 |
struct state *rp;
|
1112 |
{
|
1113 |
int anchor = (dir == AHEAD) ? '$' : '^';
|
1114 |
|
1115 |
assert(dir == AHEAD || dir == BEHIND);
|
1116 |
newarc(v->nfa, anchor, 1, lp, rp);
|
1117 |
newarc(v->nfa, anchor, 0, lp, rp);
|
1118 |
colorcomplement(v->nfa, v->cm, dir, v->wordchrs, lp, rp);
|
1119 |
/* (no need for special attention to \n) */
|
1120 |
}
|
1121 |
|
1122 |
/*
|
1123 |
- word - generate arcs for word character ahead or behind
|
1124 |
^ static VOID word(struct vars *, int, struct state *, struct state *);
|
1125 |
*/
|
1126 |
static VOID
|
1127 |
word(v, dir, lp, rp)
|
1128 |
struct vars *v;
|
1129 |
int dir; /* AHEAD or BEHIND */
|
1130 |
struct state *lp;
|
1131 |
struct state *rp;
|
1132 |
{
|
1133 |
assert(dir == AHEAD || dir == BEHIND);
|
1134 |
cloneouts(v->nfa, v->wordchrs, lp, rp, dir);
|
1135 |
/* (no need for special attention to \n) */
|
1136 |
}
|
1137 |
|
1138 |
/*
|
1139 |
- scannum - scan a number
|
1140 |
^ static int scannum(struct vars *);
|
1141 |
*/
|
1142 |
static int /* value, <= DUPMAX */
|
1143 |
scannum(v)
|
1144 |
struct vars *v;
|
1145 |
{
|
1146 |
int n = 0;
|
1147 |
|
1148 |
while (SEE(DIGIT) && n < DUPMAX) {
|
1149 |
n = n*10 + v->nextvalue;
|
1150 |
NEXT();
|
1151 |
}
|
1152 |
if (SEE(DIGIT) || n > DUPMAX) {
|
1153 |
ERR(REG_BADBR);
|
1154 |
return 0;
|
1155 |
}
|
1156 |
return n;
|
1157 |
}
|
1158 |
|
1159 |
/*
|
1160 |
- repeat - replicate subNFA for quantifiers
|
1161 |
* The duplication sequences used here are chosen carefully so that any
|
1162 |
* pointers starting out pointing into the subexpression end up pointing into
|
1163 |
* the last occurrence. (Note that it may not be strung between the same
|
1164 |
* left and right end states, however!) This used to be important for the
|
1165 |
* subRE tree, although the important bits are now handled by the in-line
|
1166 |
* code in parse(), and when this is called, it doesn't matter any more.
|
1167 |
^ static VOID repeat(struct vars *, struct state *, struct state *, int, int);
|
1168 |
*/
|
1169 |
static VOID
|
1170 |
repeat(v, lp, rp, m, n)
|
1171 |
struct vars *v;
|
1172 |
struct state *lp;
|
1173 |
struct state *rp;
|
1174 |
int m;
|
1175 |
int n;
|
1176 |
{
|
1177 |
# define SOME 2
|
1178 |
# define INF 3
|
1179 |
# define PAIR(x, y) ((x)*4 + (y))
|
1180 |
# define REDUCE(x) ( ((x) == INFINITY) ? INF : (((x) > 1) ? SOME : (x)) )
|
1181 |
CONST int rm = REDUCE(m);
|
1182 |
CONST int rn = REDUCE(n);
|
1183 |
struct state *s;
|
1184 |
struct state *s2;
|
1185 |
|
1186 |
switch (PAIR(rm, rn)) {
|
1187 |
case PAIR(0, 0): /* empty string */
|
1188 |
delsub(v->nfa, lp, rp);
|
1189 |
EMPTYARC(lp, rp);
|
1190 |
break;
|
1191 |
case PAIR(0, 1): /* do as x| */
|
1192 |
EMPTYARC(lp, rp);
|
1193 |
break;
|
1194 |
case PAIR(0, SOME): /* do as x{1,n}| */
|
1195 |
repeat(v, lp, rp, 1, n);
|
1196 |
NOERR();
|
1197 |
EMPTYARC(lp, rp);
|
1198 |
break;
|
1199 |
case PAIR(0, INF): /* loop x around */
|
1200 |
s = newstate(v->nfa);
|
1201 |
NOERR();
|
1202 |
moveouts(v->nfa, lp, s);
|
1203 |
moveins(v->nfa, rp, s);
|
1204 |
EMPTYARC(lp, s);
|
1205 |
EMPTYARC(s, rp);
|
1206 |
break;
|
1207 |
case PAIR(1, 1): /* no action required */
|
1208 |
break;
|
1209 |
case PAIR(1, SOME): /* do as x{0,n-1}x = (x{1,n-1}|)x */
|
1210 |
s = newstate(v->nfa);
|
1211 |
NOERR();
|
1212 |
moveouts(v->nfa, lp, s);
|
1213 |
dupnfa(v->nfa, s, rp, lp, s);
|
1214 |
NOERR();
|
1215 |
repeat(v, lp, s, 1, n-1);
|
1216 |
NOERR();
|
1217 |
EMPTYARC(lp, s);
|
1218 |
break;
|
1219 |
case PAIR(1, INF): /* add loopback arc */
|
1220 |
s = newstate(v->nfa);
|
1221 |
s2 = newstate(v->nfa);
|
1222 |
NOERR();
|
1223 |
moveouts(v->nfa, lp, s);
|
1224 |
moveins(v->nfa, rp, s2);
|
1225 |
EMPTYARC(lp, s);
|
1226 |
EMPTYARC(s2, rp);
|
1227 |
EMPTYARC(s2, s);
|
1228 |
break;
|
1229 |
case PAIR(SOME, SOME): /* do as x{m-1,n-1}x */
|
1230 |
s = newstate(v->nfa);
|
1231 |
NOERR();
|
1232 |
moveouts(v->nfa, lp, s);
|
1233 |
dupnfa(v->nfa, s, rp, lp, s);
|
1234 |
NOERR();
|
1235 |
repeat(v, lp, s, m-1, n-1);
|
1236 |
break;
|
1237 |
case PAIR(SOME, INF): /* do as x{m-1,}x */
|
1238 |
s = newstate(v->nfa);
|
1239 |
NOERR();
|
1240 |
moveouts(v->nfa, lp, s);
|
1241 |
dupnfa(v->nfa, s, rp, lp, s);
|
1242 |
NOERR();
|
1243 |
repeat(v, lp, s, m-1, n);
|
1244 |
break;
|
1245 |
default:
|
1246 |
ERR(REG_ASSERT);
|
1247 |
break;
|
1248 |
}
|
1249 |
}
|
1250 |
|
1251 |
/*
|
1252 |
- bracket - handle non-complemented bracket expression
|
1253 |
* Also called from cbracket for complemented bracket expressions.
|
1254 |
^ static VOID bracket(struct vars *, struct state *, struct state *);
|
1255 |
*/
|
1256 |
static VOID
|
1257 |
bracket(v, lp, rp)
|
1258 |
struct vars *v;
|
1259 |
struct state *lp;
|
1260 |
struct state *rp;
|
1261 |
{
|
1262 |
assert(SEE('['));
|
1263 |
NEXT();
|
1264 |
while (!SEE(']') && !SEE(EOS))
|
1265 |
brackpart(v, lp, rp);
|
1266 |
assert(SEE(']') || ISERR());
|
1267 |
okcolors(v->nfa, v->cm);
|
1268 |
}
|
1269 |
|
1270 |
/*
|
1271 |
- cbracket - handle complemented bracket expression
|
1272 |
* We do it by calling bracket() with dummy endpoints, and then complementing
|
1273 |
* the result. The alternative would be to invoke rainbow(), and then delete
|
1274 |
* arcs as the b.e. is seen... but that gets messy.
|
1275 |
^ static VOID cbracket(struct vars *, struct state *, struct state *);
|
1276 |
*/
|
1277 |
static VOID
|
1278 |
cbracket(v, lp, rp)
|
1279 |
struct vars *v;
|
1280 |
struct state *lp;
|
1281 |
struct state *rp;
|
1282 |
{
|
1283 |
struct state *left = newstate(v->nfa);
|
1284 |
struct state *right = newstate(v->nfa);
|
1285 |
struct state *s;
|
1286 |
struct arc *a; /* arc from lp */
|
1287 |
struct arc *ba; /* arc from left, from bracket() */
|
1288 |
struct arc *pa; /* MCCE-prototype arc */
|
1289 |
color co;
|
1290 |
chr *p;
|
1291 |
int i;
|
1292 |
|
1293 |
NOERR();
|
1294 |
bracket(v, left, right);
|
1295 |
if (v->cflags®_NLSTOP)
|
1296 |
newarc(v->nfa, PLAIN, v->nlcolor, left, right);
|
1297 |
NOERR();
|
1298 |
|
1299 |
assert(lp->nouts == 0); /* all outarcs will be ours */
|
1300 |
|
1301 |
/* easy part of complementing */
|
1302 |
colorcomplement(v->nfa, v->cm, PLAIN, left, lp, rp);
|
1303 |
NOERR();
|
1304 |
if (v->mcces == NULL) { /* no MCCEs -- we're done */
|
1305 |
dropstate(v->nfa, left);
|
1306 |
assert(right->nins == 0);
|
1307 |
freestate(v->nfa, right);
|
1308 |
return;
|
1309 |
}
|
1310 |
|
1311 |
/* but complementing gets messy in the presence of MCCEs... */
|
1312 |
NOTE(REG_ULOCALE);
|
1313 |
for (p = v->mcces->chrs, i = v->mcces->nchrs; i > 0; p++, i--) {
|
1314 |
co = GETCOLOR(v->cm, *p);
|
1315 |
a = findarc(lp, PLAIN, co);
|
1316 |
ba = findarc(left, PLAIN, co);
|
1317 |
if (ba == NULL) {
|
1318 |
assert(a != NULL);
|
1319 |
freearc(v->nfa, a);
|
1320 |
} else {
|
1321 |
assert(a == NULL);
|
1322 |
}
|
1323 |
s = newstate(v->nfa);
|
1324 |
NOERR();
|
1325 |
newarc(v->nfa, PLAIN, co, lp, s);
|
1326 |
NOERR();
|
1327 |
pa = findarc(v->mccepbegin, PLAIN, co);
|
1328 |
assert(pa != NULL);
|
1329 |
if (ba == NULL) { /* easy case, need all of them */
|
1330 |
cloneouts(v->nfa, pa->to, s, rp, PLAIN);
|
1331 |
newarc(v->nfa, '$', 1, s, rp);
|
1332 |
newarc(v->nfa, '$', 0, s, rp);
|
1333 |
colorcomplement(v->nfa, v->cm, AHEAD, pa->to, s, rp);
|
1334 |
} else { /* must be selective */
|
1335 |
if (findarc(ba->to, '$', 1) == NULL) {
|
1336 |
newarc(v->nfa, '$', 1, s, rp);
|
1337 |
newarc(v->nfa, '$', 0, s, rp);
|
1338 |
colorcomplement(v->nfa, v->cm, AHEAD, pa->to,
|
1339 |
s, rp);
|
1340 |
}
|
1341 |
for (pa = pa->to->outs; pa != NULL; pa = pa->outchain)
|
1342 |
if (findarc(ba->to, PLAIN, pa->co) == NULL)
|
1343 |
newarc(v->nfa, PLAIN, pa->co, s, rp);
|
1344 |
if (s->nouts == 0) /* limit of selectivity: none */
|
1345 |
dropstate(v->nfa, s); /* frees arc too */
|
1346 |
}
|
1347 |
NOERR();
|
1348 |
}
|
1349 |
|
1350 |
delsub(v->nfa, left, right);
|
1351 |
assert(left->nouts == 0);
|
1352 |
freestate(v->nfa, left);
|
1353 |
assert(right->nins == 0);
|
1354 |
freestate(v->nfa, right);
|
1355 |
}
|
1356 |
|
1357 |
/*
|
1358 |
- brackpart - handle one item (or range) within a bracket expression
|
1359 |
^ static VOID brackpart(struct vars *, struct state *, struct state *);
|
1360 |
*/
|
1361 |
static VOID
|
1362 |
brackpart(v, lp, rp)
|
1363 |
struct vars *v;
|
1364 |
struct state *lp;
|
1365 |
struct state *rp;
|
1366 |
{
|
1367 |
celt startc;
|
1368 |
celt endc;
|
1369 |
struct cvec *cv;
|
1370 |
chr *startp;
|
1371 |
chr *endp;
|
1372 |
chr c[1];
|
1373 |
|
1374 |
/* parse something, get rid of special cases, take shortcuts */
|
1375 |
switch (v->nexttype) {
|
1376 |
case RANGE: /* a-b-c or other botch */
|
1377 |
ERR(REG_ERANGE);
|
1378 |
return;
|
1379 |
break;
|
1380 |
case PLAIN:
|
1381 |
c[0] = v->nextvalue;
|
1382 |
NEXT();
|
1383 |
/* shortcut for ordinary chr (not range, not MCCE leader) */
|
1384 |
if (!SEE(RANGE) && !ISCELEADER(v, c[0])) {
|
1385 |
onechr(v, c[0], lp, rp);
|
1386 |
return;
|
1387 |
}
|
1388 |
startc = element(v, c, c+1);
|
1389 |
NOERR();
|
1390 |
break;
|
1391 |
case COLLEL:
|
1392 |
startp = v->now;
|
1393 |
endp = scanplain(v);
|
1394 |
INSIST(startp < endp, REG_ECOLLATE);
|
1395 |
NOERR();
|
1396 |
startc = element(v, startp, endp);
|
1397 |
NOERR();
|
1398 |
break;
|
1399 |
case ECLASS:
|
1400 |
startp = v->now;
|
1401 |
endp = scanplain(v);
|
1402 |
INSIST(startp < endp, REG_ECOLLATE);
|
1403 |
NOERR();
|
1404 |
startc = element(v, startp, endp);
|
1405 |
NOERR();
|
1406 |
cv = eclass(v, startc, (v->cflags®_ICASE));
|
1407 |
NOERR();
|
1408 |
dovec(v, cv, lp, rp);
|
1409 |
return;
|
1410 |
break;
|
1411 |
case CCLASS:
|
1412 |
startp = v->now;
|
1413 |
endp = scanplain(v);
|
1414 |
INSIST(startp < endp, REG_ECTYPE);
|
1415 |
NOERR();
|
1416 |
cv = cclass(v, startp, endp, (v->cflags®_ICASE));
|
1417 |
NOERR();
|
1418 |
dovec(v, cv, lp, rp);
|
1419 |
return;
|
1420 |
break;
|
1421 |
default:
|
1422 |
ERR(REG_ASSERT);
|
1423 |
return;
|
1424 |
break;
|
1425 |
}
|
1426 |
|
1427 |
if (SEE(RANGE)) {
|
1428 |
NEXT();
|
1429 |
switch (v->nexttype) {
|
1430 |
case PLAIN:
|
1431 |
case RANGE:
|
1432 |
c[0] = v->nextvalue;
|
1433 |
NEXT();
|
1434 |
endc = element(v, c, c+1);
|
1435 |
NOERR();
|
1436 |
break;
|
1437 |
case COLLEL:
|
1438 |
startp = v->now;
|
1439 |
endp = scanplain(v);
|
1440 |
INSIST(startp < endp, REG_ECOLLATE);
|
1441 |
NOERR();
|
1442 |
endc = element(v, startp, endp);
|
1443 |
NOERR();
|
1444 |
break;
|
1445 |
default:
|
1446 |
ERR(REG_ERANGE);
|
1447 |
return;
|
1448 |
break;
|
1449 |
}
|
1450 |
} else
|
1451 |
endc = startc;
|
1452 |
|
1453 |
/*
|
1454 |
* Ranges are unportable. Actually, standard C does
|
1455 |
* guarantee that digits are contiguous, but making
|
1456 |
* that an exception is just too complicated.
|
1457 |
*/
|
1458 |
if (startc != endc)
|
1459 |
NOTE(REG_UUNPORT);
|
1460 |
cv = range(v, startc, endc, (v->cflags®_ICASE));
|
1461 |
NOERR();
|
1462 |
dovec(v, cv, lp, rp);
|
1463 |
}
|
1464 |
|
1465 |
/*
|
1466 |
- scanplain - scan PLAIN contents of [. etc.
|
1467 |
* Certain bits of trickery in lex.c know that this code does not try
|
1468 |
* to look past the final bracket of the [. etc.
|
1469 |
^ static chr *scanplain(struct vars *);
|
1470 |
*/
|
1471 |
static chr * /* just after end of sequence */
|
1472 |
scanplain(v)
|
1473 |
struct vars *v;
|
1474 |
{
|
1475 |
chr *endp;
|
1476 |
|
1477 |
assert(SEE(COLLEL) || SEE(ECLASS) || SEE(CCLASS));
|
1478 |
NEXT();
|
1479 |
|
1480 |
endp = v->now;
|
1481 |
while (SEE(PLAIN)) {
|
1482 |
endp = v->now;
|
1483 |
NEXT();
|
1484 |
}
|
1485 |
|
1486 |
assert(SEE(END) || ISERR());
|
1487 |
NEXT();
|
1488 |
|
1489 |
return endp;
|
1490 |
}
|
1491 |
|
1492 |
/*
|
1493 |
- leaders - process a cvec of collating elements to also include leaders
|
1494 |
* Also gives all characters involved their own colors, which is almost
|
1495 |
* certainly necessary, and sets up little disconnected subNFA.
|
1496 |
^ static VOID leaders(struct vars *, struct cvec *);
|
1497 |
*/
|
1498 |
static VOID
|
1499 |
leaders(v, cv)
|
1500 |
struct vars *v;
|
1501 |
struct cvec *cv;
|
1502 |
{
|
1503 |
int mcce;
|
1504 |
chr *p;
|
1505 |
chr leader;
|
1506 |
struct state *s;
|
1507 |
struct arc *a;
|
1508 |
|
1509 |
v->mccepbegin = newstate(v->nfa);
|
1510 |
v->mccepend = newstate(v->nfa);
|
1511 |
NOERR();
|
1512 |
|
1513 |
for (mcce = 0; mcce < cv->nmcces; mcce++) {
|
1514 |
p = cv->mcces[mcce];
|
1515 |
leader = *p;
|
1516 |
if (!haschr(cv, leader)) {
|
1517 |
addchr(cv, leader);
|
1518 |
s = newstate(v->nfa);
|
1519 |
newarc(v->nfa, PLAIN, subcolor(v->cm, leader),
|
1520 |
v->mccepbegin, s);
|
1521 |
okcolors(v->nfa, v->cm);
|
1522 |
} else {
|
1523 |
a = findarc(v->mccepbegin, PLAIN,
|
1524 |
GETCOLOR(v->cm, leader));
|
1525 |
assert(a != NULL);
|
1526 |
s = a->to;
|
1527 |
assert(s != v->mccepend);
|
1528 |
}
|
1529 |
p++;
|
1530 |
assert(*p != 0 && *(p+1) == 0); /* only 2-char MCCEs for now */
|
1531 |
newarc(v->nfa, PLAIN, subcolor(v->cm, *p), s, v->mccepend);
|
1532 |
okcolors(v->nfa, v->cm);
|
1533 |
}
|
1534 |
}
|
1535 |
|
1536 |
/*
|
1537 |
- onechr - fill in arcs for a plain character, and possible case complements
|
1538 |
* This is mostly a shortcut for efficient handling of the common case.
|
1539 |
^ static VOID onechr(struct vars *, pchr, struct state *, struct state *);
|
1540 |
*/
|
1541 |
static VOID
|
1542 |
onechr(v, c, lp, rp)
|
1543 |
struct vars *v;
|
1544 |
pchr c;
|
1545 |
struct state *lp;
|
1546 |
struct state *rp;
|
1547 |
{
|
1548 |
if (!(v->cflags®_ICASE)) {
|
1549 |
newarc(v->nfa, PLAIN, subcolor(v->cm, c), lp, rp);
|
1550 |
return;
|
1551 |
}
|
1552 |
|
1553 |
/* rats, need general case anyway... */
|
1554 |
dovec(v, allcases(v, c), lp, rp);
|
1555 |
}
|
1556 |
|
1557 |
/*
|
1558 |
- dovec - fill in arcs for each element of a cvec
|
1559 |
* This one has to handle the messy cases, like MCCEs and MCCE leaders.
|
1560 |
^ static VOID dovec(struct vars *, struct cvec *, struct state *,
|
1561 |
^ struct state *);
|
1562 |
*/
|
1563 |
static VOID
|
1564 |
dovec(v, cv, lp, rp)
|
1565 |
struct vars *v;
|
1566 |
struct cvec *cv;
|
1567 |
struct state *lp;
|
1568 |
struct state *rp;
|
1569 |
{
|
1570 |
chr ch, from, to;
|
1571 |
celt ce;
|
1572 |
chr *p;
|
1573 |
int i;
|
1574 |
color co;
|
1575 |
struct cvec *leads;
|
1576 |
struct arc *a;
|
1577 |
struct arc *pa; /* arc in prototype */
|
1578 |
struct state *s;
|
1579 |
struct state *ps; /* state in prototype */
|
1580 |
|
1581 |
/* need a place to store leaders, if any */
|
1582 |
if (nmcces(v) > 0) {
|
1583 |
assert(v->mcces != NULL);
|
1584 |
if (v->cv2 == NULL || v->cv2->nchrs < v->mcces->nchrs) {
|
1585 |
if (v->cv2 != NULL)
|
1586 |
free(v->cv2);
|
1587 |
v->cv2 = newcvec(v->mcces->nchrs, 0, v->mcces->nmcces);
|
1588 |
NOERR();
|
1589 |
leads = v->cv2;
|
1590 |
} else
|
1591 |
leads = clearcvec(v->cv2);
|
1592 |
} else
|
1593 |
leads = NULL;
|
1594 |
|
1595 |
/* first, get the ordinary characters out of the way */
|
1596 |
for (p = cv->chrs, i = cv->nchrs; i > 0; p++, i--) {
|
1597 |
ch = *p;
|
1598 |
if (!ISCELEADER(v, ch))
|
1599 |
newarc(v->nfa, PLAIN, subcolor(v->cm, ch), lp, rp);
|
1600 |
else {
|
1601 |
assert(singleton(v->cm, ch));
|
1602 |
assert(leads != NULL);
|
1603 |
if (!haschr(leads, ch))
|
1604 |
addchr(leads, ch);
|
1605 |
}
|
1606 |
}
|
1607 |
|
1608 |
/* and the ranges */
|
1609 |
for (p = cv->ranges, i = cv->nranges; i > 0; p += 2, i--) {
|
1610 |
from = *p;
|
1611 |
to = *(p+1);
|
1612 |
while (from <= to && (ce = nextleader(v, from, to)) != NOCELT) {
|
1613 |
if (from < ce)
|
1614 |
subrange(v, from, ce - 1, lp, rp);
|
1615 |
assert(singleton(v->cm, ce));
|
1616 |
assert(leads != NULL);
|
1617 |
if (!haschr(leads, ce))
|
1618 |
addchr(leads, ce);
|
1619 |
from = ce + 1;
|
1620 |
}
|
1621 |
if (from <= to)
|
1622 |
subrange(v, from, to, lp, rp);
|
1623 |
}
|
1624 |
|
1625 |
if ((leads == NULL || leads->nchrs == 0) && cv->nmcces == 0)
|
1626 |
return;
|
1627 |
|
1628 |
/* deal with the MCCE leaders */
|
1629 |
NOTE(REG_ULOCALE);
|
1630 |
for (p = leads->chrs, i = leads->nchrs; i > 0; p++, i--) {
|
1631 |
co = GETCOLOR(v->cm, *p);
|
1632 |
a = findarc(lp, PLAIN, co);
|
1633 |
if (a != NULL)
|
1634 |
s = a->to;
|
1635 |
else {
|
1636 |
s = newstate(v->nfa);
|
1637 |
NOERR();
|
1638 |
newarc(v->nfa, PLAIN, co, lp, s);
|
1639 |
NOERR();
|
1640 |
}
|
1641 |
pa = findarc(v->mccepbegin, PLAIN, co);
|
1642 |
assert(pa != NULL);
|
1643 |
ps = pa->to;
|
1644 |
newarc(v->nfa, '$', 1, s, rp);
|
1645 |
newarc(v->nfa, '$', 0, s, rp);
|
1646 |
colorcomplement(v->nfa, v->cm, AHEAD, ps, s, rp);
|
1647 |
NOERR();
|
1648 |
}
|
1649 |
|
1650 |
/* and the MCCEs */
|
1651 |
for (i = 0; i < cv->nmcces; i++) {
|
1652 |
p = cv->mcces[i];
|
1653 |
assert(singleton(v->cm, *p));
|
1654 |
if (!singleton(v->cm, *p)) {
|
1655 |
ERR(REG_ASSERT);
|
1656 |
return;
|
1657 |
}
|
1658 |
ch = *p++;
|
1659 |
co = GETCOLOR(v->cm, ch);
|
1660 |
a = findarc(lp, PLAIN, co);
|
1661 |
if (a != NULL)
|
1662 |
s = a->to;
|
1663 |
else {
|
1664 |
s = newstate(v->nfa);
|
1665 |
NOERR();
|
1666 |
newarc(v->nfa, PLAIN, co, lp, s);
|
1667 |
NOERR();
|
1668 |
}
|
1669 |
assert(*p != 0); /* at least two chars */
|
1670 |
assert(singleton(v->cm, *p));
|
1671 |
ch = *p++;
|
1672 |
co = GETCOLOR(v->cm, ch);
|
1673 |
assert(*p == 0); /* and only two, for now */
|
1674 |
newarc(v->nfa, PLAIN, co, s, rp);
|
1675 |
NOERR();
|
1676 |
}
|
1677 |
}
|
1678 |
|
1679 |
/*
|
1680 |
- nextleader - find next MCCE leader within range
|
1681 |
^ static celt nextleader(struct vars *, pchr, pchr);
|
1682 |
*/
|
1683 |
static celt /* NOCELT means none */
|
1684 |
nextleader(v, from, to)
|
1685 |
struct vars *v;
|
1686 |
pchr from;
|
1687 |
pchr to;
|
1688 |
{
|
1689 |
int i;
|
1690 |
chr *p;
|
1691 |
chr ch;
|
1692 |
celt it = NOCELT;
|
1693 |
|
1694 |
if (v->mcces == NULL)
|
1695 |
return it;
|
1696 |
|
1697 |
for (i = v->mcces->nchrs, p = v->mcces->chrs; i > 0; i--, p++) {
|
1698 |
ch = *p;
|
1699 |
if (from <= ch && ch <= to)
|
1700 |
if (it == NOCELT || ch < it)
|
1701 |
it = ch;
|
1702 |
}
|
1703 |
return it;
|
1704 |
}
|
1705 |
|
1706 |
/*
|
1707 |
- wordchrs - set up word-chr list for word-boundary stuff, if needed
|
1708 |
* The list is kept as a bunch of arcs between two dummy states; it's
|
1709 |
* disposed of by the unreachable-states sweep in NFA optimization.
|
1710 |
* Does NEXT(). Must not be called from any unusual lexical context.
|
1711 |
* This should be reconciled with the \w etc. handling in lex.c, and
|
1712 |
* should be cleaned up to reduce dependencies on input scanning.
|
1713 |
^ static VOID wordchrs(struct vars *);
|
1714 |
*/
|
1715 |
static VOID
|
1716 |
wordchrs(v)
|
1717 |
struct vars *v;
|
1718 |
{
|
1719 |
struct state *left;
|
1720 |
struct state *right;
|
1721 |
|
1722 |
if (v->wordchrs != NULL) {
|
1723 |
NEXT(); /* for consistency */
|
1724 |
return;
|
1725 |
}
|
1726 |
|
1727 |
left = newstate(v->nfa);
|
1728 |
right = newstate(v->nfa);
|
1729 |
NOERR();
|
1730 |
/* fine point: implemented with [::], and lexer will set REG_ULOCALE */
|
1731 |
lexword(v);
|
1732 |
NEXT();
|
1733 |
assert(v->savenow != NULL && SEE('['));
|
1734 |
bracket(v, left, right);
|
1735 |
assert((v->savenow != NULL && SEE(']')) || ISERR());
|
1736 |
NEXT();
|
1737 |
NOERR();
|
1738 |
v->wordchrs = left;
|
1739 |
}
|
1740 |
|
1741 |
/*
|
1742 |
- subre - allocate a subre
|
1743 |
^ static struct subre *subre(struct vars *, int, int, struct state *,
|
1744 |
^ struct state *);
|
1745 |
*/
|
1746 |
static struct subre *
|
1747 |
subre(v, op, flags, begin, end)
|
1748 |
struct vars *v;
|
1749 |
int op;
|
1750 |
int flags;
|
1751 |
struct state *begin;
|
1752 |
struct state *end;
|
1753 |
{
|
1754 |
struct subre *ret;
|
1755 |
|
1756 |
ret = v->treefree;
|
1757 |
if (ret != NULL)
|
1758 |
v->treefree = ret->left;
|
1759 |
else {
|
1760 |
ret = (struct subre *)MALLOC(sizeof(struct subre));
|
1761 |
if (ret == NULL) {
|
1762 |
ERR(REG_ESPACE);
|
1763 |
return NULL;
|
1764 |
}
|
1765 |
ret->chain = v->treechain;
|
1766 |
v->treechain = ret;
|
1767 |
}
|
1768 |
|
1769 |
assert(strchr("|.b(=", op) != NULL);
|
1770 |
|
1771 |
ret->op = op;
|
1772 |
ret->flags = flags;
|
1773 |
ret->retry = 0;
|
1774 |
ret->subno = 0;
|
1775 |
ret->min = ret->max = 1;
|
1776 |
ret->left = NULL;
|
1777 |
ret->right = NULL;
|
1778 |
ret->begin = begin;
|
1779 |
ret->end = end;
|
1780 |
ZAPCNFA(ret->cnfa);
|
1781 |
|
1782 |
return ret;
|
1783 |
}
|
1784 |
|
1785 |
/*
|
1786 |
- freesubre - free a subRE subtree
|
1787 |
^ static VOID freesubre(struct vars *, struct subre *);
|
1788 |
*/
|
1789 |
static VOID
|
1790 |
freesubre(v, sr)
|
1791 |
struct vars *v; /* might be NULL */
|
1792 |
struct subre *sr;
|
1793 |
{
|
1794 |
if (sr == NULL)
|
1795 |
return;
|
1796 |
|
1797 |
if (sr->left != NULL)
|
1798 |
freesubre(v, sr->left);
|
1799 |
if (sr->right != NULL)
|
1800 |
freesubre(v, sr->right);
|
1801 |
|
1802 |
freesrnode(v, sr);
|
1803 |
}
|
1804 |
|
1805 |
/*
|
1806 |
- freesrnode - free one node in a subRE subtree
|
1807 |
^ static VOID freesrnode(struct vars *, struct subre *);
|
1808 |
*/
|
1809 |
static VOID
|
1810 |
freesrnode(v, sr)
|
1811 |
struct vars *v; /* might be NULL */
|
1812 |
struct subre *sr;
|
1813 |
{
|
1814 |
if (sr == NULL)
|
1815 |
return;
|
1816 |
|
1817 |
if (!NULLCNFA(sr->cnfa))
|
1818 |
freecnfa(&sr->cnfa);
|
1819 |
sr->flags = 0;
|
1820 |
|
1821 |
if (v != NULL) {
|
1822 |
sr->left = v->treefree;
|
1823 |
v->treefree = sr;
|
1824 |
} else
|
1825 |
FREE(sr);
|
1826 |
}
|
1827 |
|
1828 |
/*
|
1829 |
- optst - optimize a subRE subtree
|
1830 |
^ static VOID optst(struct vars *, struct subre *);
|
1831 |
*/
|
1832 |
static VOID
|
1833 |
optst(v, t)
|
1834 |
struct vars *v;
|
1835 |
struct subre *t;
|
1836 |
{
|
1837 |
if (t == NULL)
|
1838 |
return;
|
1839 |
|
1840 |
/* recurse through children */
|
1841 |
if (t->left != NULL)
|
1842 |
optst(v, t->left);
|
1843 |
if (t->right != NULL)
|
1844 |
optst(v, t->right);
|
1845 |
}
|
1846 |
|
1847 |
/*
|
1848 |
- numst - number tree nodes (assigning retry indexes)
|
1849 |
^ static int numst(struct subre *, int);
|
1850 |
*/
|
1851 |
static int /* next number */
|
1852 |
numst(t, start)
|
1853 |
struct subre *t;
|
1854 |
int start; /* starting point for subtree numbers */
|
1855 |
{
|
1856 |
int i;
|
1857 |
|
1858 |
assert(t != NULL);
|
1859 |
|
1860 |
i = start;
|
1861 |
t->retry = (short)i++;
|
1862 |
if (t->left != NULL)
|
1863 |
i = numst(t->left, i);
|
1864 |
if (t->right != NULL)
|
1865 |
i = numst(t->right, i);
|
1866 |
return i;
|
1867 |
}
|
1868 |
|
1869 |
/*
|
1870 |
- markst - mark tree nodes as INUSE
|
1871 |
^ static VOID markst(struct subre *);
|
1872 |
*/
|
1873 |
static VOID
|
1874 |
markst(t)
|
1875 |
struct subre *t;
|
1876 |
{
|
1877 |
assert(t != NULL);
|
1878 |
|
1879 |
t->flags |= INUSE;
|
1880 |
if (t->left != NULL)
|
1881 |
markst(t->left);
|
1882 |
if (t->right != NULL)
|
1883 |
markst(t->right);
|
1884 |
}
|
1885 |
|
1886 |
/*
|
1887 |
- cleanst - free any tree nodes not marked INUSE
|
1888 |
^ static VOID cleanst(struct vars *);
|
1889 |
*/
|
1890 |
static VOID
|
1891 |
cleanst(v)
|
1892 |
struct vars *v;
|
1893 |
{
|
1894 |
struct subre *t;
|
1895 |
struct subre *next;
|
1896 |
|
1897 |
for (t = v->treechain; t != NULL; t = next) {
|
1898 |
next = t->chain;
|
1899 |
if (!(t->flags&INUSE))
|
1900 |
FREE(t);
|
1901 |
}
|
1902 |
v->treechain = NULL;
|
1903 |
v->treefree = NULL; /* just on general principles */
|
1904 |
}
|
1905 |
|
1906 |
/*
|
1907 |
- nfatree - turn a subRE subtree into a tree of compacted NFAs
|
1908 |
^ static long nfatree(struct vars *, struct subre *, FILE *);
|
1909 |
*/
|
1910 |
static long /* optimize results from top node */
|
1911 |
nfatree(v, t, f)
|
1912 |
struct vars *v;
|
1913 |
struct subre *t;
|
1914 |
FILE *f; /* for debug output */
|
1915 |
{
|
1916 |
assert(t != NULL && t->begin != NULL);
|
1917 |
|
1918 |
if (t->left != NULL)
|
1919 |
(DISCARD)nfatree(v, t->left, f);
|
1920 |
if (t->right != NULL)
|
1921 |
(DISCARD)nfatree(v, t->right, f);
|
1922 |
|
1923 |
return nfanode(v, t, f);
|
1924 |
}
|
1925 |
|
1926 |
/*
|
1927 |
- nfanode - do one NFA for nfatree
|
1928 |
^ static long nfanode(struct vars *, struct subre *, FILE *);
|
1929 |
*/
|
1930 |
static long /* optimize results */
|
1931 |
nfanode(v, t, f)
|
1932 |
struct vars *v;
|
1933 |
struct subre *t;
|
1934 |
FILE *f; /* for debug output */
|
1935 |
{
|
1936 |
struct nfa *nfa;
|
1937 |
long ret = 0;
|
1938 |
char idbuf[50];
|
1939 |
|
1940 |
assert(t->begin != NULL);
|
1941 |
|
1942 |
if (f != NULL)
|
1943 |
fprintf(f, "\n\n\n========= TREE NODE %s ==========\n",
|
1944 |
stid(t, idbuf, sizeof(idbuf)));
|
1945 |
nfa = newnfa(v, v->cm, v->nfa);
|
1946 |
NOERRZ();
|
1947 |
dupnfa(nfa, t->begin, t->end, nfa->init, nfa->final);
|
1948 |
if (!ISERR()) {
|
1949 |
specialcolors(nfa);
|
1950 |
ret = optimize(nfa, f);
|
1951 |
}
|
1952 |
if (!ISERR())
|
1953 |
compact(nfa, &t->cnfa);
|
1954 |
|
1955 |
freenfa(nfa);
|
1956 |
return ret;
|
1957 |
}
|
1958 |
|
1959 |
/*
|
1960 |
- newlacon - allocate a lookahead-constraint subRE
|
1961 |
^ static int newlacon(struct vars *, struct state *, struct state *, int);
|
1962 |
*/
|
1963 |
static int /* lacon number */
|
1964 |
newlacon(v, begin, end, pos)
|
1965 |
struct vars *v;
|
1966 |
struct state *begin;
|
1967 |
struct state *end;
|
1968 |
int pos;
|
1969 |
{
|
1970 |
int n;
|
1971 |
struct subre *sub;
|
1972 |
|
1973 |
if (v->nlacons == 0) {
|
1974 |
v->lacons = (struct subre *)MALLOC(2 * sizeof(struct subre));
|
1975 |
n = 1; /* skip 0th */
|
1976 |
v->nlacons = 2;
|
1977 |
} else {
|
1978 |
v->lacons = (struct subre *)REALLOC(v->lacons,
|
1979 |
(v->nlacons+1)*sizeof(struct subre));
|
1980 |
n = v->nlacons++;
|
1981 |
}
|
1982 |
if (v->lacons == NULL) {
|
1983 |
ERR(REG_ESPACE);
|
1984 |
return 0;
|
1985 |
}
|
1986 |
sub = &v->lacons[n];
|
1987 |
sub->begin = begin;
|
1988 |
sub->end = end;
|
1989 |
sub->subno = pos;
|
1990 |
ZAPCNFA(sub->cnfa);
|
1991 |
return n;
|
1992 |
}
|
1993 |
|
1994 |
/*
|
1995 |
- freelacons - free lookahead-constraint subRE vector
|
1996 |
^ static VOID freelacons(struct subre *, int);
|
1997 |
*/
|
1998 |
static VOID
|
1999 |
freelacons(subs, n)
|
2000 |
struct subre *subs;
|
2001 |
int n;
|
2002 |
{
|
2003 |
struct subre *sub;
|
2004 |
int i;
|
2005 |
|
2006 |
assert(n > 0);
|
2007 |
for (sub = subs + 1, i = n - 1; i > 0; sub++, i--) /* no 0th */
|
2008 |
if (!NULLCNFA(sub->cnfa))
|
2009 |
freecnfa(&sub->cnfa);
|
2010 |
FREE(subs);
|
2011 |
}
|
2012 |
|
2013 |
/*
|
2014 |
- rfree - free a whole RE (insides of regfree)
|
2015 |
^ static VOID rfree(regex_t *);
|
2016 |
*/
|
2017 |
static VOID
|
2018 |
rfree(re)
|
2019 |
regex_t *re;
|
2020 |
{
|
2021 |
struct guts *g;
|
2022 |
|
2023 |
if (re == NULL || re->re_magic != REMAGIC)
|
2024 |
return;
|
2025 |
|
2026 |
re->re_magic = 0; /* invalidate RE */
|
2027 |
g = (struct guts *)re->re_guts;
|
2028 |
re->re_guts = NULL;
|
2029 |
re->re_fns = NULL;
|
2030 |
g->magic = 0;
|
2031 |
freecm(&g->cmap);
|
2032 |
if (g->tree != NULL)
|
2033 |
freesubre((struct vars *)NULL, g->tree);
|
2034 |
if (g->lacons != NULL)
|
2035 |
freelacons(g->lacons, g->nlacons);
|
2036 |
if (!NULLCNFA(g->search))
|
2037 |
freecnfa(&g->search);
|
2038 |
FREE(g);
|
2039 |
}
|
2040 |
|
2041 |
/*
|
2042 |
- dump - dump an RE in human-readable form
|
2043 |
^ static VOID dump(regex_t *, FILE *);
|
2044 |
*/
|
2045 |
static VOID
|
2046 |
dump(re, f)
|
2047 |
regex_t *re;
|
2048 |
FILE *f;
|
2049 |
{
|
2050 |
#ifdef REG_DEBUG
|
2051 |
struct guts *g;
|
2052 |
int i;
|
2053 |
|
2054 |
if (re->re_magic != REMAGIC)
|
2055 |
fprintf(f, "bad magic number (0x%x not 0x%x)\n", re->re_magic,
|
2056 |
REMAGIC);
|
2057 |
if (re->re_guts == NULL) {
|
2058 |
fprintf(f, "NULL guts!!!\n");
|
2059 |
return;
|
2060 |
}
|
2061 |
g = (struct guts *)re->re_guts;
|
2062 |
if (g->magic != GUTSMAGIC)
|
2063 |
fprintf(f, "bad guts magic number (0x%x not 0x%x)\n", g->magic,
|
2064 |
GUTSMAGIC);
|
2065 |
|
2066 |
fprintf(f, "\n\n\n========= DUMP ==========\n");
|
2067 |
fprintf(f, "nsub %d, info 0%lo, csize %d, ntree %d\n",
|
2068 |
re->re_nsub, re->re_info, re->re_csize, g->ntree);
|
2069 |
|
2070 |
dumpcolors(&g->cmap, f);
|
2071 |
if (!NULLCNFA(g->search)) {
|
2072 |
printf("\nsearch:\n");
|
2073 |
dumpcnfa(&g->search, f);
|
2074 |
}
|
2075 |
for (i = 1; i < g->nlacons; i++) {
|
2076 |
fprintf(f, "\nla%d (%s):\n", i,
|
2077 |
(g->lacons[i].subno) ? "positive" : "negative");
|
2078 |
dumpcnfa(&g->lacons[i].cnfa, f);
|
2079 |
}
|
2080 |
fprintf(f, "\n");
|
2081 |
dumpst(g->tree, f, 0);
|
2082 |
#endif
|
2083 |
}
|
2084 |
|
2085 |
/*
|
2086 |
- dumpst - dump a subRE tree
|
2087 |
^ static VOID dumpst(struct subre *, FILE *, int);
|
2088 |
*/
|
2089 |
static VOID
|
2090 |
dumpst(t, f, nfapresent)
|
2091 |
struct subre *t;
|
2092 |
FILE *f;
|
2093 |
int nfapresent; /* is the original NFA still around? */
|
2094 |
{
|
2095 |
if (t == NULL)
|
2096 |
fprintf(f, "null tree\n");
|
2097 |
else
|
2098 |
stdump(t, f, nfapresent);
|
2099 |
fflush(f);
|
2100 |
}
|
2101 |
|
2102 |
/*
|
2103 |
- stdump - recursive guts of dumpst
|
2104 |
^ static VOID stdump(struct subre *, FILE *, int);
|
2105 |
*/
|
2106 |
static VOID
|
2107 |
stdump(t, f, nfapresent)
|
2108 |
struct subre *t;
|
2109 |
FILE *f;
|
2110 |
int nfapresent; /* is the original NFA still around? */
|
2111 |
{
|
2112 |
char idbuf[50];
|
2113 |
|
2114 |
fprintf(f, "%s. `%c'", stid(t, idbuf, sizeof(idbuf)), t->op);
|
2115 |
if (t->flags&LONGER)
|
2116 |
fprintf(f, " longest");
|
2117 |
if (t->flags&SHORTER)
|
2118 |
fprintf(f, " shortest");
|
2119 |
if (t->flags&MIXED)
|
2120 |
fprintf(f, " hasmixed");
|
2121 |
if (t->flags&CAP)
|
2122 |
fprintf(f, " hascapture");
|
2123 |
if (t->flags&BACKR)
|
2124 |
fprintf(f, " hasbackref");
|
2125 |
if (!(t->flags&INUSE))
|
2126 |
fprintf(f, " UNUSED");
|
2127 |
if (t->subno != 0)
|
2128 |
fprintf(f, " (#%d)", t->subno);
|
2129 |
if (t->min != 1 || t->max != 1) {
|
2130 |
fprintf(f, " {%d,", t->min);
|
2131 |
if (t->max != INFINITY)
|
2132 |
fprintf(f, "%d", t->max);
|
2133 |
fprintf(f, "}");
|
2134 |
}
|
2135 |
if (nfapresent)
|
2136 |
fprintf(f, " %ld-%ld", (long)t->begin->no, (long)t->end->no);
|
2137 |
if (t->left != NULL)
|
2138 |
fprintf(f, " L:%s", stid(t->left, idbuf, sizeof(idbuf)));
|
2139 |
if (t->right != NULL)
|
2140 |
fprintf(f, " R:%s", stid(t->right, idbuf, sizeof(idbuf)));
|
2141 |
if (!NULLCNFA(t->cnfa)) {
|
2142 |
fprintf(f, "\n");
|
2143 |
dumpcnfa(&t->cnfa, f);
|
2144 |
fprintf(f, "\n");
|
2145 |
}
|
2146 |
if (t->left != NULL)
|
2147 |
stdump(t->left, f, nfapresent);
|
2148 |
if (t->right != NULL)
|
2149 |
stdump(t->right, f, nfapresent);
|
2150 |
}
|
2151 |
|
2152 |
/*
|
2153 |
- stid - identify a subtree node for dumping
|
2154 |
^ static char *stid(struct subre *, char *, size_t);
|
2155 |
*/
|
2156 |
static char * /* points to buf or constant string */
|
2157 |
stid(t, buf, bufsize)
|
2158 |
struct subre *t;
|
2159 |
char *buf;
|
2160 |
size_t bufsize;
|
2161 |
{
|
2162 |
/* big enough for hex int or decimal t->retry? */
|
2163 |
if (bufsize < sizeof(int)*2 + 3 || bufsize < sizeof(t->retry)*3 + 1)
|
2164 |
return "unable";
|
2165 |
if (t->retry != 0)
|
2166 |
sprintf(buf, "%d", t->retry);
|
2167 |
else
|
2168 |
sprintf(buf, "0x%x", (int)t); /* may lose bits, that's okay */
|
2169 |
return buf;
|
2170 |
}
|
2171 |
|
2172 |
#include "regc_lex.c"
|
2173 |
#include "regc_color.c"
|
2174 |
#include "regc_nfa.c"
|
2175 |
#include "regc_cvec.c"
|
2176 |
#include "regc_locale.c"
|
2177 |
|
2178 |
/* End of regcomp.c */
|