1 |
/* $Header$ */ |
2 |
/* |
3 |
* re_*exec and friends - match REs |
4 |
* |
5 |
* Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. |
6 |
* |
7 |
* Development of this software was funded, in part, by Cray Research Inc., |
8 |
* UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics |
9 |
* Corporation, none of whom are responsible for the results. The author |
10 |
* thanks all of them. |
11 |
* |
12 |
* Redistribution and use in source and binary forms -- with or without |
13 |
* modification -- are permitted for any purpose, provided that |
14 |
* redistributions in source form retain this entire copyright notice and |
15 |
* indicate the origin and nature of any modifications. |
16 |
* |
17 |
* I'd appreciate being given credit for this package in the documentation |
18 |
* of software which uses it, but that is not a requirement. |
19 |
* |
20 |
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, |
21 |
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY |
22 |
* AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL |
23 |
* HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
24 |
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
25 |
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; |
26 |
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, |
27 |
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR |
28 |
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF |
29 |
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
30 |
* |
31 |
*/ |
32 |
|
33 |
#include "regguts.h" |
34 |
|
35 |
|
36 |
|
37 |
/* lazy-DFA representation */ |
38 |
struct arcp { /* "pointer" to an outarc */ |
39 |
struct sset *ss; |
40 |
color co; |
41 |
}; |
42 |
|
43 |
struct sset { /* state set */ |
44 |
unsigned *states; /* pointer to bitvector */ |
45 |
unsigned hash; /* hash of bitvector */ |
46 |
# define HASH(bv, nw) (((nw) == 1) ? *(bv) : hash(bv, nw)) |
47 |
# define HIT(h,bv,ss,nw) ((ss)->hash == (h) && ((nw) == 1 || \ |
48 |
memcmp(VS(bv), VS((ss)->states), (nw)*sizeof(unsigned)) == 0)) |
49 |
int flags; |
50 |
# define STARTER 01 /* the initial state set */ |
51 |
# define POSTSTATE 02 /* includes the goal state */ |
52 |
# define LOCKED 04 /* locked in cache */ |
53 |
# define NOPROGRESS 010 /* zero-progress state set */ |
54 |
struct arcp ins; /* chain of inarcs pointing here */ |
55 |
chr *lastseen; /* last entered on arrival here */ |
56 |
struct sset **outs; /* outarc vector indexed by color */ |
57 |
struct arcp *inchain; /* chain-pointer vector for outarcs */ |
58 |
}; |
59 |
|
60 |
struct dfa { |
61 |
int nssets; /* size of cache */ |
62 |
int nssused; /* how many entries occupied yet */ |
63 |
int nstates; /* number of states */ |
64 |
int ncolors; /* length of outarc and inchain vectors */ |
65 |
int wordsper; /* length of state-set bitvectors */ |
66 |
struct sset *ssets; /* state-set cache */ |
67 |
unsigned *statesarea; /* bitvector storage */ |
68 |
unsigned *work; /* pointer to work area within statesarea */ |
69 |
struct sset **outsarea; /* outarc-vector storage */ |
70 |
struct arcp *incarea; /* inchain storage */ |
71 |
struct cnfa *cnfa; |
72 |
struct colormap *cm; |
73 |
chr *lastpost; /* location of last cache-flushed success */ |
74 |
chr *lastnopr; /* location of last cache-flushed NOPROGRESS */ |
75 |
struct sset *search; /* replacement-search-pointer memory */ |
76 |
int cptsmalloced; /* were the areas individually malloced? */ |
77 |
char *mallocarea; /* self, or master malloced area, or NULL */ |
78 |
}; |
79 |
|
80 |
#define WORK 1 /* number of work bitvectors needed */ |
81 |
|
82 |
/* setup for non-malloc allocation for small cases */ |
83 |
#define FEWSTATES 20 /* must be less than UBITS */ |
84 |
#define FEWCOLORS 15 |
85 |
struct smalldfa { |
86 |
struct dfa dfa; |
87 |
struct sset ssets[FEWSTATES*2]; |
88 |
unsigned statesarea[FEWSTATES*2 + WORK]; |
89 |
struct sset *outsarea[FEWSTATES*2 * FEWCOLORS]; |
90 |
struct arcp incarea[FEWSTATES*2 * FEWCOLORS]; |
91 |
}; |
92 |
#define DOMALLOC ((struct smalldfa *)NULL) /* force malloc */ |
93 |
|
94 |
|
95 |
|
96 |
/* internal variables, bundled for easy passing around */ |
97 |
struct vars { |
98 |
regex_t *re; |
99 |
struct guts *g; |
100 |
int eflags; /* copies of arguments */ |
101 |
size_t nmatch; |
102 |
regmatch_t *pmatch; |
103 |
rm_detail_t *details; |
104 |
chr *start; /* start of string */ |
105 |
chr *stop; /* just past end of string */ |
106 |
int err; /* error code if any (0 none) */ |
107 |
regoff_t *mem; /* memory vector for backtracking */ |
108 |
struct smalldfa dfa1; |
109 |
struct smalldfa dfa2; |
110 |
}; |
111 |
#define VISERR(vv) ((vv)->err != 0) /* have we seen an error yet? */ |
112 |
#define ISERR() VISERR(v) |
113 |
#define VERR(vv,e) (((vv)->err) ? (vv)->err : ((vv)->err = (e))) |
114 |
#define ERR(e) VERR(v, e) /* record an error */ |
115 |
#define NOERR() {if (ISERR()) return v->err;} /* if error seen, return it */ |
116 |
#define OFF(p) ((p) - v->start) |
117 |
#define LOFF(p) ((long)OFF(p)) |
118 |
|
119 |
|
120 |
|
121 |
/* |
122 |
* forward declarations |
123 |
*/ |
124 |
/* =====^!^===== begin forwards =====^!^===== */ |
125 |
/* automatically gathered by fwd; do not hand-edit */ |
126 |
/* === regexec.c === */ |
127 |
int exec _ANSI_ARGS_((regex_t *, CONST chr *, size_t, rm_detail_t *, size_t, regmatch_t [], int)); |
128 |
static int find _ANSI_ARGS_((struct vars *, struct cnfa *, struct colormap *)); |
129 |
static int cfind _ANSI_ARGS_((struct vars *, struct cnfa *, struct colormap *)); |
130 |
static int cfindloop _ANSI_ARGS_((struct vars *, struct cnfa *, struct colormap *, struct dfa *, struct dfa *, chr **)); |
131 |
static VOID zapsubs _ANSI_ARGS_((regmatch_t *, size_t)); |
132 |
static VOID zapmem _ANSI_ARGS_((struct vars *, struct subre *)); |
133 |
static VOID subset _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *)); |
134 |
static int dissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *)); |
135 |
static int condissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *)); |
136 |
static int altdissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *)); |
137 |
static int cdissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *)); |
138 |
static int ccondissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *)); |
139 |
static int crevdissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *)); |
140 |
static int cbrdissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *)); |
141 |
static int caltdissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *)); |
142 |
/* === rege_dfa.c === */ |
143 |
static chr *longest _ANSI_ARGS_((struct vars *, struct dfa *, chr *, chr *, int *)); |
144 |
static chr *shortest _ANSI_ARGS_((struct vars *, struct dfa *, chr *, chr *, chr *, chr **, int *)); |
145 |
static chr *lastcold _ANSI_ARGS_((struct vars *, struct dfa *)); |
146 |
static struct dfa *newdfa _ANSI_ARGS_((struct vars *, struct cnfa *, struct colormap *, struct smalldfa *)); |
147 |
static VOID freedfa _ANSI_ARGS_((struct dfa *)); |
148 |
static unsigned hash _ANSI_ARGS_((unsigned *, int)); |
149 |
static struct sset *initialize _ANSI_ARGS_((struct vars *, struct dfa *, chr *)); |
150 |
static struct sset *miss _ANSI_ARGS_((struct vars *, struct dfa *, struct sset *, pcolor, chr *, chr *)); |
151 |
static int lacon _ANSI_ARGS_((struct vars *, struct cnfa *, chr *, pcolor)); |
152 |
static struct sset *getvacant _ANSI_ARGS_((struct vars *, struct dfa *, chr *, chr *)); |
153 |
static struct sset *pickss _ANSI_ARGS_((struct vars *, struct dfa *, chr *, chr *)); |
154 |
/* automatically gathered by fwd; do not hand-edit */ |
155 |
/* =====^!^===== end forwards =====^!^===== */ |
156 |
|
157 |
|
158 |
|
159 |
/* |
160 |
- exec - match regular expression |
161 |
^ int exec(regex_t *, CONST chr *, size_t, rm_detail_t *, |
162 |
^ size_t, regmatch_t [], int); |
163 |
*/ |
164 |
int |
165 |
exec(re, string, len, details, nmatch, pmatch, flags) |
166 |
regex_t *re; |
167 |
CONST chr *string; |
168 |
size_t len; |
169 |
rm_detail_t *details; |
170 |
size_t nmatch; |
171 |
regmatch_t pmatch[]; |
172 |
int flags; |
173 |
{ |
174 |
struct vars var; |
175 |
register struct vars *v = &var; |
176 |
int st; |
177 |
size_t n; |
178 |
int backref; |
179 |
# define LOCALMAT 20 |
180 |
regmatch_t mat[LOCALMAT]; |
181 |
# define LOCALMEM 40 |
182 |
regoff_t mem[LOCALMEM]; |
183 |
|
184 |
/* sanity checks */ |
185 |
if (re == NULL || string == NULL || re->re_magic != REMAGIC) |
186 |
return REG_INVARG; |
187 |
if (re->re_csize != sizeof(chr)) |
188 |
return REG_MIXED; |
189 |
|
190 |
/* setup */ |
191 |
v->re = re; |
192 |
v->g = (struct guts *)re->re_guts; |
193 |
if ((v->g->cflags®_EXPECT) && details == NULL) |
194 |
return REG_INVARG; |
195 |
if (v->g->info®_UIMPOSSIBLE) |
196 |
return REG_NOMATCH; |
197 |
backref = (v->g->info®_UBACKREF) ? 1 : 0; |
198 |
v->eflags = flags; |
199 |
if (v->g->cflags®_NOSUB) |
200 |
nmatch = 0; /* override client */ |
201 |
v->nmatch = nmatch; |
202 |
if (backref) { |
203 |
/* need work area */ |
204 |
if (v->g->nsub + 1 <= LOCALMAT) |
205 |
v->pmatch = mat; |
206 |
else |
207 |
v->pmatch = (regmatch_t *)MALLOC((v->g->nsub + 1) * |
208 |
sizeof(regmatch_t)); |
209 |
if (v->pmatch == NULL) |
210 |
return REG_ESPACE; |
211 |
v->nmatch = v->g->nsub + 1; |
212 |
} else |
213 |
v->pmatch = pmatch; |
214 |
v->details = details; |
215 |
v->start = (chr *)string; |
216 |
v->stop = (chr *)string + len; |
217 |
v->err = 0; |
218 |
if (backref) { |
219 |
/* need retry memory */ |
220 |
assert(v->g->ntree >= 0); |
221 |
n = (size_t)v->g->ntree; |
222 |
if (n <= LOCALMEM) |
223 |
v->mem = mem; |
224 |
else |
225 |
v->mem = (regoff_t *)MALLOC(n*sizeof(regoff_t)); |
226 |
if (v->mem == NULL) { |
227 |
if (v->pmatch != pmatch && v->pmatch != mat) |
228 |
FREE(v->pmatch); |
229 |
return REG_ESPACE; |
230 |
} |
231 |
} else |
232 |
v->mem = NULL; |
233 |
|
234 |
/* do it */ |
235 |
assert(v->g->tree != NULL); |
236 |
if (backref) |
237 |
st = cfind(v, &v->g->tree->cnfa, &v->g->cmap); |
238 |
else |
239 |
st = find(v, &v->g->tree->cnfa, &v->g->cmap); |
240 |
|
241 |
/* copy (portion of) match vector over if necessary */ |
242 |
if (st == REG_OKAY && v->pmatch != pmatch && nmatch > 0) { |
243 |
zapsubs(pmatch, nmatch); |
244 |
n = (nmatch < v->nmatch) ? nmatch : v->nmatch; |
245 |
memcpy(VS(pmatch), VS(v->pmatch), n*sizeof(regmatch_t)); |
246 |
} |
247 |
|
248 |
/* clean up */ |
249 |
if (v->pmatch != pmatch && v->pmatch != mat) |
250 |
FREE(v->pmatch); |
251 |
if (v->mem != NULL && v->mem != mem) |
252 |
FREE(v->mem); |
253 |
return st; |
254 |
} |
255 |
|
256 |
/* |
257 |
- find - find a match for the main NFA (no-complications case) |
258 |
^ static int find(struct vars *, struct cnfa *, struct colormap *); |
259 |
*/ |
260 |
static int |
261 |
find(v, cnfa, cm) |
262 |
struct vars *v; |
263 |
struct cnfa *cnfa; |
264 |
struct colormap *cm; |
265 |
{ |
266 |
struct dfa *s; |
267 |
struct dfa *d; |
268 |
chr *begin; |
269 |
chr *end = NULL; |
270 |
chr *cold; |
271 |
chr *open; /* open and close of range of possible starts */ |
272 |
chr *close; |
273 |
int hitend; |
274 |
int shorter = (v->g->tree->flags&SHORTER) ? 1 : 0; |
275 |
|
276 |
/* first, a shot with the search RE */ |
277 |
s = newdfa(v, &v->g->search, cm, &v->dfa1); |
278 |
assert(!(ISERR() && s != NULL)); |
279 |
NOERR(); |
280 |
MDEBUG(("\nsearch at %ld\n", LOFF(v->start))); |
281 |
cold = NULL; |
282 |
close = shortest(v, s, v->start, v->start, v->stop, &cold, (int *)NULL); |
283 |
freedfa(s); |
284 |
NOERR(); |
285 |
if (v->g->cflags®_EXPECT) { |
286 |
assert(v->details != NULL); |
287 |
if (cold != NULL) |
288 |
v->details->rm_extend.rm_so = OFF(cold); |
289 |
else |
290 |
v->details->rm_extend.rm_so = OFF(v->stop); |
291 |
v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */ |
292 |
} |
293 |
if (close == NULL) /* not found */ |
294 |
return REG_NOMATCH; |
295 |
if (v->nmatch == 0) /* found, don't need exact location */ |
296 |
return REG_OKAY; |
297 |
|
298 |
/* find starting point and match */ |
299 |
assert(cold != NULL); |
300 |
open = cold; |
301 |
cold = NULL; |
302 |
MDEBUG(("between %ld and %ld\n", LOFF(open), LOFF(close))); |
303 |
d = newdfa(v, cnfa, cm, &v->dfa1); |
304 |
assert(!(ISERR() && d != NULL)); |
305 |
NOERR(); |
306 |
for (begin = open; begin <= close; begin++) { |
307 |
MDEBUG(("\nfind trying at %ld\n", LOFF(begin))); |
308 |
if (shorter) |
309 |
end = shortest(v, d, begin, begin, v->stop, |
310 |
(chr **)NULL, &hitend); |
311 |
else |
312 |
end = longest(v, d, begin, v->stop, &hitend); |
313 |
NOERR(); |
314 |
if (hitend && cold == NULL) |
315 |
cold = begin; |
316 |
if (end != NULL) |
317 |
break; /* NOTE BREAK OUT */ |
318 |
} |
319 |
assert(end != NULL); /* search RE succeeded so loop should */ |
320 |
freedfa(d); |
321 |
|
322 |
/* and pin down details */ |
323 |
assert(v->nmatch > 0); |
324 |
v->pmatch[0].rm_so = OFF(begin); |
325 |
v->pmatch[0].rm_eo = OFF(end); |
326 |
if (v->g->cflags®_EXPECT) { |
327 |
if (cold != NULL) |
328 |
v->details->rm_extend.rm_so = OFF(cold); |
329 |
else |
330 |
v->details->rm_extend.rm_so = OFF(v->stop); |
331 |
v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */ |
332 |
} |
333 |
if (v->nmatch == 1) /* no need for submatches */ |
334 |
return REG_OKAY; |
335 |
|
336 |
/* submatches */ |
337 |
zapsubs(v->pmatch, v->nmatch); |
338 |
return dissect(v, v->g->tree, begin, end); |
339 |
} |
340 |
|
341 |
/* |
342 |
- cfind - find a match for the main NFA (with complications) |
343 |
^ static int cfind(struct vars *, struct cnfa *, struct colormap *); |
344 |
*/ |
345 |
static int |
346 |
cfind(v, cnfa, cm) |
347 |
struct vars *v; |
348 |
struct cnfa *cnfa; |
349 |
struct colormap *cm; |
350 |
{ |
351 |
struct dfa *s; |
352 |
struct dfa *d; |
353 |
chr *cold; |
354 |
int ret; |
355 |
|
356 |
s = newdfa(v, &v->g->search, cm, &v->dfa1); |
357 |
NOERR(); |
358 |
d = newdfa(v, cnfa, cm, &v->dfa2); |
359 |
if (ISERR()) { |
360 |
assert(d == NULL); |
361 |
freedfa(s); |
362 |
return v->err; |
363 |
} |
364 |
|
365 |
ret = cfindloop(v, cnfa, cm, d, s, &cold); |
366 |
|
367 |
freedfa(d); |
368 |
freedfa(s); |
369 |
NOERR(); |
370 |
if (v->g->cflags®_EXPECT) { |
371 |
assert(v->details != NULL); |
372 |
if (cold != NULL) |
373 |
v->details->rm_extend.rm_so = OFF(cold); |
374 |
else |
375 |
v->details->rm_extend.rm_so = OFF(v->stop); |
376 |
v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */ |
377 |
} |
378 |
return ret; |
379 |
} |
380 |
|
381 |
/* |
382 |
- cfindloop - the heart of cfind |
383 |
^ static int cfindloop(struct vars *, struct cnfa *, struct colormap *, |
384 |
^ struct dfa *, struct dfa *, chr **); |
385 |
*/ |
386 |
static int |
387 |
cfindloop(v, cnfa, cm, d, s, coldp) |
388 |
struct vars *v; |
389 |
struct cnfa *cnfa; |
390 |
struct colormap *cm; |
391 |
struct dfa *d; |
392 |
struct dfa *s; |
393 |
chr **coldp; /* where to put coldstart pointer */ |
394 |
{ |
395 |
chr *begin; |
396 |
chr *end; |
397 |
chr *cold; |
398 |
chr *open; /* open and close of range of possible starts */ |
399 |
chr *close; |
400 |
chr *estart; |
401 |
chr *estop; |
402 |
int er; |
403 |
int shorter = v->g->tree->flags&SHORTER; |
404 |
int hitend; |
405 |
|
406 |
assert(d != NULL && s != NULL); |
407 |
cold = NULL; |
408 |
close = v->start; |
409 |
do { |
410 |
MDEBUG(("\ncsearch at %ld\n", LOFF(close))); |
411 |
close = shortest(v, s, close, close, v->stop, &cold, (int *)NULL); |
412 |
if (close == NULL) |
413 |
break; /* NOTE BREAK */ |
414 |
assert(cold != NULL); |
415 |
open = cold; |
416 |
cold = NULL; |
417 |
MDEBUG(("cbetween %ld and %ld\n", LOFF(open), LOFF(close))); |
418 |
for (begin = open; begin <= close; begin++) { |
419 |
MDEBUG(("\ncfind trying at %ld\n", LOFF(begin))); |
420 |
estart = begin; |
421 |
estop = v->stop; |
422 |
for (;;) { |
423 |
if (shorter) |
424 |
end = shortest(v, d, begin, estart, |
425 |
estop, (chr **)NULL, &hitend); |
426 |
else |
427 |
end = longest(v, d, begin, estop, |
428 |
&hitend); |
429 |
if (hitend && cold == NULL) |
430 |
cold = begin; |
431 |
if (end == NULL) |
432 |
break; /* NOTE BREAK OUT */ |
433 |
MDEBUG(("tentative end %ld\n", LOFF(end))); |
434 |
zapsubs(v->pmatch, v->nmatch); |
435 |
zapmem(v, v->g->tree); |
436 |
er = cdissect(v, v->g->tree, begin, end); |
437 |
if (er == REG_OKAY) { |
438 |
if (v->nmatch > 0) { |
439 |
v->pmatch[0].rm_so = OFF(begin); |
440 |
v->pmatch[0].rm_eo = OFF(end); |
441 |
} |
442 |
*coldp = cold; |
443 |
return REG_OKAY; |
444 |
} |
445 |
if (er != REG_NOMATCH) { |
446 |
ERR(er); |
447 |
return er; |
448 |
} |
449 |
if ((shorter) ? end == estop : end == begin) { |
450 |
/* no point in trying again */ |
451 |
*coldp = cold; |
452 |
return REG_NOMATCH; |
453 |
} |
454 |
/* go around and try again */ |
455 |
if (shorter) |
456 |
estart = end + 1; |
457 |
else |
458 |
estop = end - 1; |
459 |
} |
460 |
} |
461 |
} while (close < v->stop); |
462 |
|
463 |
*coldp = cold; |
464 |
return REG_NOMATCH; |
465 |
} |
466 |
|
467 |
/* |
468 |
- zapsubs - initialize the subexpression matches to "no match" |
469 |
^ static VOID zapsubs(regmatch_t *, size_t); |
470 |
*/ |
471 |
static VOID |
472 |
zapsubs(p, n) |
473 |
regmatch_t *p; |
474 |
size_t n; |
475 |
{ |
476 |
size_t i; |
477 |
|
478 |
for (i = n-1; i > 0; i--) { |
479 |
p[i].rm_so = -1; |
480 |
p[i].rm_eo = -1; |
481 |
} |
482 |
} |
483 |
|
484 |
/* |
485 |
- zapmem - initialize the retry memory of a subtree to zeros |
486 |
^ static VOID zapmem(struct vars *, struct subre *); |
487 |
*/ |
488 |
static VOID |
489 |
zapmem(v, t) |
490 |
struct vars *v; |
491 |
struct subre *t; |
492 |
{ |
493 |
if (t == NULL) |
494 |
return; |
495 |
|
496 |
assert(v->mem != NULL); |
497 |
v->mem[t->retry] = 0; |
498 |
if (t->op == '(') { |
499 |
assert(t->subno > 0); |
500 |
v->pmatch[t->subno].rm_so = -1; |
501 |
v->pmatch[t->subno].rm_eo = -1; |
502 |
} |
503 |
|
504 |
if (t->left != NULL) |
505 |
zapmem(v, t->left); |
506 |
if (t->right != NULL) |
507 |
zapmem(v, t->right); |
508 |
} |
509 |
|
510 |
/* |
511 |
- subset - set any subexpression relevant to a successful subre |
512 |
^ static VOID subset(struct vars *, struct subre *, chr *, chr *); |
513 |
*/ |
514 |
static VOID |
515 |
subset(v, sub, begin, end) |
516 |
struct vars *v; |
517 |
struct subre *sub; |
518 |
chr *begin; |
519 |
chr *end; |
520 |
{ |
521 |
int n = sub->subno; |
522 |
|
523 |
assert(n > 0); |
524 |
if ((size_t)n >= v->nmatch) |
525 |
return; |
526 |
|
527 |
MDEBUG(("setting %d\n", n)); |
528 |
v->pmatch[n].rm_so = OFF(begin); |
529 |
v->pmatch[n].rm_eo = OFF(end); |
530 |
} |
531 |
|
532 |
/* |
533 |
- dissect - determine subexpression matches (uncomplicated case) |
534 |
^ static int dissect(struct vars *, struct subre *, chr *, chr *); |
535 |
*/ |
536 |
static int /* regexec return code */ |
537 |
dissect(v, t, begin, end) |
538 |
struct vars *v; |
539 |
struct subre *t; |
540 |
chr *begin; /* beginning of relevant substring */ |
541 |
chr *end; /* end of same */ |
542 |
{ |
543 |
assert(t != NULL); |
544 |
MDEBUG(("dissect %ld-%ld\n", LOFF(begin), LOFF(end))); |
545 |
|
546 |
switch (t->op) { |
547 |
case '=': /* terminal node */ |
548 |
assert(t->left == NULL && t->right == NULL); |
549 |
return REG_OKAY; /* no action, parent did the work */ |
550 |
break; |
551 |
case '|': /* alternation */ |
552 |
assert(t->left != NULL); |
553 |
return altdissect(v, t, begin, end); |
554 |
break; |
555 |
case 'b': /* back ref -- shouldn't be calling us! */ |
556 |
return REG_ASSERT; |
557 |
break; |
558 |
case '.': /* concatenation */ |
559 |
assert(t->left != NULL && t->right != NULL); |
560 |
return condissect(v, t, begin, end); |
561 |
break; |
562 |
case '(': /* capturing */ |
563 |
assert(t->left != NULL && t->right == NULL); |
564 |
assert(t->subno > 0); |
565 |
subset(v, t, begin, end); |
566 |
return dissect(v, t->left, begin, end); |
567 |
break; |
568 |
default: |
569 |
return REG_ASSERT; |
570 |
break; |
571 |
} |
572 |
} |
573 |
|
574 |
/* |
575 |
- condissect - determine concatenation subexpression matches (uncomplicated) |
576 |
^ static int condissect(struct vars *, struct subre *, chr *, chr *); |
577 |
*/ |
578 |
static int /* regexec return code */ |
579 |
condissect(v, t, begin, end) |
580 |
struct vars *v; |
581 |
struct subre *t; |
582 |
chr *begin; /* beginning of relevant substring */ |
583 |
chr *end; /* end of same */ |
584 |
{ |
585 |
struct dfa *d; |
586 |
struct dfa *d2; |
587 |
chr *mid; |
588 |
int i; |
589 |
int shorter = (t->left->flags&SHORTER) ? 1 : 0; |
590 |
chr *stop = (shorter) ? end : begin; |
591 |
|
592 |
assert(t->op == '.'); |
593 |
assert(t->left != NULL && t->left->cnfa.nstates > 0); |
594 |
assert(t->right != NULL && t->right->cnfa.nstates > 0); |
595 |
|
596 |
d = newdfa(v, &t->left->cnfa, &v->g->cmap, &v->dfa1); |
597 |
NOERR(); |
598 |
d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, &v->dfa2); |
599 |
if (ISERR()) { |
600 |
assert(d2 == NULL); |
601 |
freedfa(d); |
602 |
return v->err; |
603 |
} |
604 |
|
605 |
/* pick a tentative midpoint */ |
606 |
if (shorter) |
607 |
mid = shortest(v, d, begin, begin, end, (chr **)NULL, |
608 |
(int *)NULL); |
609 |
else |
610 |
mid = longest(v, d, begin, end, (int *)NULL); |
611 |
if (mid == NULL) { |
612 |
freedfa(d); |
613 |
freedfa(d2); |
614 |
return REG_ASSERT; |
615 |
} |
616 |
MDEBUG(("tentative midpoint %ld\n", LOFF(mid))); |
617 |
|
618 |
/* iterate until satisfaction or failure */ |
619 |
while (longest(v, d2, mid, end, (int *)NULL) != end) { |
620 |
/* that midpoint didn't work, find a new one */ |
621 |
if (mid == stop) { |
622 |
/* all possibilities exhausted! */ |
623 |
MDEBUG(("no midpoint!\n")); |
624 |
freedfa(d); |
625 |
freedfa(d2); |
626 |
return REG_ASSERT; |
627 |
} |
628 |
if (shorter) |
629 |
mid = shortest(v, d, begin, mid+1, end, (chr **)NULL, |
630 |
(int *)NULL); |
631 |
else |
632 |
mid = longest(v, d, begin, mid-1, (int *)NULL); |
633 |
if (mid == NULL) { |
634 |
/* failed to find a new one! */ |
635 |
MDEBUG(("failed midpoint!\n")); |
636 |
freedfa(d); |
637 |
freedfa(d2); |
638 |
return REG_ASSERT; |
639 |
} |
640 |
MDEBUG(("new midpoint %ld\n", LOFF(mid))); |
641 |
} |
642 |
|
643 |
/* satisfaction */ |
644 |
MDEBUG(("successful\n")); |
645 |
freedfa(d); |
646 |
freedfa(d2); |
647 |
i = dissect(v, t->left, begin, mid); |
648 |
if (i != REG_OKAY) |
649 |
return i; |
650 |
return dissect(v, t->right, mid, end); |
651 |
} |
652 |
|
653 |
/* |
654 |
- altdissect - determine alternative subexpression matches (uncomplicated) |
655 |
^ static int altdissect(struct vars *, struct subre *, chr *, chr *); |
656 |
*/ |
657 |
static int /* regexec return code */ |
658 |
altdissect(v, t, begin, end) |
659 |
struct vars *v; |
660 |
struct subre *t; |
661 |
chr *begin; /* beginning of relevant substring */ |
662 |
chr *end; /* end of same */ |
663 |
{ |
664 |
struct dfa *d; |
665 |
int i; |
666 |
|
667 |
assert(t != NULL); |
668 |
assert(t->op == '|'); |
669 |
|
670 |
for (i = 0; t != NULL; t = t->right, i++) { |
671 |
MDEBUG(("trying %dth\n", i)); |
672 |
assert(t->left != NULL && t->left->cnfa.nstates > 0); |
673 |
d = newdfa(v, &t->left->cnfa, &v->g->cmap, &v->dfa1); |
674 |
if (ISERR()) |
675 |
return v->err; |
676 |
if (longest(v, d, begin, end, (int *)NULL) == end) { |
677 |
MDEBUG(("success\n")); |
678 |
freedfa(d); |
679 |
return dissect(v, t->left, begin, end); |
680 |
} |
681 |
freedfa(d); |
682 |
} |
683 |
return REG_ASSERT; /* none of them matched?!? */ |
684 |
} |
685 |
|
686 |
/* |
687 |
- cdissect - determine subexpression matches (with complications) |
688 |
* The retry memory stores the offset of the trial midpoint from begin, |
689 |
* plus 1 so that 0 uniquely means "clean slate". |
690 |
^ static int cdissect(struct vars *, struct subre *, chr *, chr *); |
691 |
*/ |
692 |
static int /* regexec return code */ |
693 |
cdissect(v, t, begin, end) |
694 |
struct vars *v; |
695 |
struct subre *t; |
696 |
chr *begin; /* beginning of relevant substring */ |
697 |
chr *end; /* end of same */ |
698 |
{ |
699 |
int er; |
700 |
|
701 |
assert(t != NULL); |
702 |
MDEBUG(("cdissect %ld-%ld %c\n", LOFF(begin), LOFF(end), t->op)); |
703 |
|
704 |
switch (t->op) { |
705 |
case '=': /* terminal node */ |
706 |
assert(t->left == NULL && t->right == NULL); |
707 |
return REG_OKAY; /* no action, parent did the work */ |
708 |
break; |
709 |
case '|': /* alternation */ |
710 |
assert(t->left != NULL); |
711 |
return caltdissect(v, t, begin, end); |
712 |
break; |
713 |
case 'b': /* back ref -- shouldn't be calling us! */ |
714 |
assert(t->left == NULL && t->right == NULL); |
715 |
return cbrdissect(v, t, begin, end); |
716 |
break; |
717 |
case '.': /* concatenation */ |
718 |
assert(t->left != NULL && t->right != NULL); |
719 |
return ccondissect(v, t, begin, end); |
720 |
break; |
721 |
case '(': /* capturing */ |
722 |
assert(t->left != NULL && t->right == NULL); |
723 |
assert(t->subno > 0); |
724 |
er = cdissect(v, t->left, begin, end); |
725 |
if (er == REG_OKAY) |
726 |
subset(v, t, begin, end); |
727 |
return er; |
728 |
break; |
729 |
default: |
730 |
return REG_ASSERT; |
731 |
break; |
732 |
} |
733 |
} |
734 |
|
735 |
/* |
736 |
- ccondissect - concatenation subexpression matches (with complications) |
737 |
* The retry memory stores the offset of the trial midpoint from begin, |
738 |
* plus 1 so that 0 uniquely means "clean slate". |
739 |
^ static int ccondissect(struct vars *, struct subre *, chr *, chr *); |
740 |
*/ |
741 |
static int /* regexec return code */ |
742 |
ccondissect(v, t, begin, end) |
743 |
struct vars *v; |
744 |
struct subre *t; |
745 |
chr *begin; /* beginning of relevant substring */ |
746 |
chr *end; /* end of same */ |
747 |
{ |
748 |
struct dfa *d; |
749 |
struct dfa *d2; |
750 |
chr *mid; |
751 |
int er; |
752 |
|
753 |
assert(t->op == '.'); |
754 |
assert(t->left != NULL && t->left->cnfa.nstates > 0); |
755 |
assert(t->right != NULL && t->right->cnfa.nstates > 0); |
756 |
|
757 |
if (t->left->flags&SHORTER) /* reverse scan */ |
758 |
return crevdissect(v, t, begin, end); |
759 |
|
760 |
d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); |
761 |
if (ISERR()) |
762 |
return v->err; |
763 |
d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, DOMALLOC); |
764 |
if (ISERR()) { |
765 |
freedfa(d); |
766 |
return v->err; |
767 |
} |
768 |
MDEBUG(("cconcat %d\n", t->retry)); |
769 |
|
770 |
/* pick a tentative midpoint */ |
771 |
if (v->mem[t->retry] == 0) { |
772 |
mid = longest(v, d, begin, end, (int *)NULL); |
773 |
if (mid == NULL) { |
774 |
freedfa(d); |
775 |
freedfa(d2); |
776 |
return REG_NOMATCH; |
777 |
} |
778 |
MDEBUG(("tentative midpoint %ld\n", LOFF(mid))); |
779 |
v->mem[t->retry] = (mid - begin) + 1; |
780 |
} else { |
781 |
mid = begin + (v->mem[t->retry] - 1); |
782 |
MDEBUG(("working midpoint %ld\n", LOFF(mid))); |
783 |
} |
784 |
|
785 |
/* iterate until satisfaction or failure */ |
786 |
for (;;) { |
787 |
/* try this midpoint on for size */ |
788 |
er = cdissect(v, t->left, begin, mid); |
789 |
if (er == REG_OKAY && |
790 |
longest(v, d2, mid, end, (int *)NULL) == end && |
791 |
(er = cdissect(v, t->right, mid, end)) == |
792 |
REG_OKAY) |
793 |
break; /* NOTE BREAK OUT */ |
794 |
if (er != REG_OKAY && er != REG_NOMATCH) { |
795 |
freedfa(d); |
796 |
freedfa(d2); |
797 |
return er; |
798 |
} |
799 |
|
800 |
/* that midpoint didn't work, find a new one */ |
801 |
if (mid == begin) { |
802 |
/* all possibilities exhausted */ |
803 |
MDEBUG(("%d no midpoint\n", t->retry)); |
804 |
freedfa(d); |
805 |
freedfa(d2); |
806 |
return REG_NOMATCH; |
807 |
} |
808 |
mid = longest(v, d, begin, mid-1, (int *)NULL); |
809 |
if (mid == NULL) { |
810 |
/* failed to find a new one */ |
811 |
MDEBUG(("%d failed midpoint\n", t->retry)); |
812 |
freedfa(d); |
813 |
freedfa(d2); |
814 |
return REG_NOMATCH; |
815 |
} |
816 |
MDEBUG(("%d: new midpoint %ld\n", t->retry, LOFF(mid))); |
817 |
v->mem[t->retry] = (mid - begin) + 1; |
818 |
zapmem(v, t->left); |
819 |
zapmem(v, t->right); |
820 |
} |
821 |
|
822 |
/* satisfaction */ |
823 |
MDEBUG(("successful\n")); |
824 |
freedfa(d); |
825 |
freedfa(d2); |
826 |
return REG_OKAY; |
827 |
} |
828 |
|
829 |
/* |
830 |
- crevdissect - determine backref shortest-first subexpression matches |
831 |
* The retry memory stores the offset of the trial midpoint from begin, |
832 |
* plus 1 so that 0 uniquely means "clean slate". |
833 |
^ static int crevdissect(struct vars *, struct subre *, chr *, chr *); |
834 |
*/ |
835 |
static int /* regexec return code */ |
836 |
crevdissect(v, t, begin, end) |
837 |
struct vars *v; |
838 |
struct subre *t; |
839 |
chr *begin; /* beginning of relevant substring */ |
840 |
chr *end; /* end of same */ |
841 |
{ |
842 |
struct dfa *d; |
843 |
struct dfa *d2; |
844 |
chr *mid; |
845 |
int er; |
846 |
|
847 |
assert(t->op == '.'); |
848 |
assert(t->left != NULL && t->left->cnfa.nstates > 0); |
849 |
assert(t->right != NULL && t->right->cnfa.nstates > 0); |
850 |
assert(t->left->flags&SHORTER); |
851 |
|
852 |
/* concatenation -- need to split the substring between parts */ |
853 |
d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); |
854 |
if (ISERR()) |
855 |
return v->err; |
856 |
d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, DOMALLOC); |
857 |
if (ISERR()) { |
858 |
freedfa(d); |
859 |
return v->err; |
860 |
} |
861 |
MDEBUG(("crev %d\n", t->retry)); |
862 |
|
863 |
/* pick a tentative midpoint */ |
864 |
if (v->mem[t->retry] == 0) { |
865 |
mid = shortest(v, d, begin, begin, end, (chr **)NULL, (int *)NULL); |
866 |
if (mid == NULL) { |
867 |
freedfa(d); |
868 |
freedfa(d2); |
869 |
return REG_NOMATCH; |
870 |
} |
871 |
MDEBUG(("tentative midpoint %ld\n", LOFF(mid))); |
872 |
v->mem[t->retry] = (mid - begin) + 1; |
873 |
} else { |
874 |
mid = begin + (v->mem[t->retry] - 1); |
875 |
MDEBUG(("working midpoint %ld\n", LOFF(mid))); |
876 |
} |
877 |
|
878 |
/* iterate until satisfaction or failure */ |
879 |
for (;;) { |
880 |
/* try this midpoint on for size */ |
881 |
er = cdissect(v, t->left, begin, mid); |
882 |
if (er == REG_OKAY && |
883 |
longest(v, d2, mid, end, (int *)NULL) == end && |
884 |
(er = cdissect(v, t->right, mid, end)) == |
885 |
REG_OKAY) |
886 |
break; /* NOTE BREAK OUT */ |
887 |
if (er != REG_OKAY && er != REG_NOMATCH) { |
888 |
freedfa(d); |
889 |
freedfa(d2); |
890 |
return er; |
891 |
} |
892 |
|
893 |
/* that midpoint didn't work, find a new one */ |
894 |
if (mid == end) { |
895 |
/* all possibilities exhausted */ |
896 |
MDEBUG(("%d no midpoint\n", t->retry)); |
897 |
freedfa(d); |
898 |
freedfa(d2); |
899 |
return REG_NOMATCH; |
900 |
} |
901 |
mid = shortest(v, d, begin, mid+1, end, (chr **)NULL, (int *)NULL); |
902 |
if (mid == NULL) { |
903 |
/* failed to find a new one */ |
904 |
MDEBUG(("%d failed midpoint\n", t->retry)); |
905 |
freedfa(d); |
906 |
freedfa(d2); |
907 |
return REG_NOMATCH; |
908 |
} |
909 |
MDEBUG(("%d: new midpoint %ld\n", t->retry, LOFF(mid))); |
910 |
v->mem[t->retry] = (mid - begin) + 1; |
911 |
zapmem(v, t->left); |
912 |
zapmem(v, t->right); |
913 |
} |
914 |
|
915 |
/* satisfaction */ |
916 |
MDEBUG(("successful\n")); |
917 |
freedfa(d); |
918 |
freedfa(d2); |
919 |
return REG_OKAY; |
920 |
} |
921 |
|
922 |
/* |
923 |
- cbrdissect - determine backref subexpression matches |
924 |
^ static int cbrdissect(struct vars *, struct subre *, chr *, chr *); |
925 |
*/ |
926 |
static int /* regexec return code */ |
927 |
cbrdissect(v, t, begin, end) |
928 |
struct vars *v; |
929 |
struct subre *t; |
930 |
chr *begin; /* beginning of relevant substring */ |
931 |
chr *end; /* end of same */ |
932 |
{ |
933 |
int i; |
934 |
int n = t->subno; |
935 |
size_t len; |
936 |
chr *paren; |
937 |
chr *p; |
938 |
chr *stop; |
939 |
int min = t->min; |
940 |
int max = t->max; |
941 |
|
942 |
assert(t != NULL); |
943 |
assert(t->op == 'b'); |
944 |
assert(n >= 0); |
945 |
assert((size_t)n < v->nmatch); |
946 |
|
947 |
MDEBUG(("cbackref n%d %d{%d-%d}\n", t->retry, n, min, max)); |
948 |
|
949 |
if (v->pmatch[n].rm_so == -1) |
950 |
return REG_NOMATCH; |
951 |
paren = v->start + v->pmatch[n].rm_so; |
952 |
len = v->pmatch[n].rm_eo - v->pmatch[n].rm_so; |
953 |
|
954 |
/* no room to maneuver -- retries are pointless */ |
955 |
if (v->mem[t->retry]) |
956 |
return REG_NOMATCH; |
957 |
v->mem[t->retry] = 1; |
958 |
|
959 |
/* special-case zero-length string */ |
960 |
if (len == 0) { |
961 |
if (begin == end) |
962 |
return REG_OKAY; |
963 |
return REG_NOMATCH; |
964 |
} |
965 |
|
966 |
/* and too-short string */ |
967 |
assert(end >= begin); |
968 |
if ((size_t)(end - begin) < len) |
969 |
return REG_NOMATCH; |
970 |
stop = end - len; |
971 |
|
972 |
/* count occurrences */ |
973 |
i = 0; |
974 |
for (p = begin; p <= stop && (i < max || max == INFINITY); p += len) { |
975 |
if ((*v->g->compare)(paren, p, len) != 0) |
976 |
break; |
977 |
i++; |
978 |
} |
979 |
MDEBUG(("cbackref found %d\n", i)); |
980 |
|
981 |
/* and sort it out */ |
982 |
if (p != end) /* didn't consume all of it */ |
983 |
return REG_NOMATCH; |
984 |
if (min <= i && (i <= max || max == INFINITY)) |
985 |
return REG_OKAY; |
986 |
return REG_NOMATCH; /* out of range */ |
987 |
} |
988 |
|
989 |
/* |
990 |
- caltdissect - determine alternative subexpression matches (w. complications) |
991 |
^ static int caltdissect(struct vars *, struct subre *, chr *, chr *); |
992 |
*/ |
993 |
static int /* regexec return code */ |
994 |
caltdissect(v, t, begin, end) |
995 |
struct vars *v; |
996 |
struct subre *t; |
997 |
chr *begin; /* beginning of relevant substring */ |
998 |
chr *end; /* end of same */ |
999 |
{ |
1000 |
struct dfa *d; |
1001 |
int er; |
1002 |
# define UNTRIED 0 /* not yet tried at all */ |
1003 |
# define TRYING 1 /* top matched, trying submatches */ |
1004 |
# define TRIED 2 /* top didn't match or submatches exhausted */ |
1005 |
|
1006 |
if (t == NULL) |
1007 |
return REG_NOMATCH; |
1008 |
assert(t->op == '|'); |
1009 |
if (v->mem[t->retry] == TRIED) |
1010 |
return caltdissect(v, t->right, begin, end); |
1011 |
|
1012 |
MDEBUG(("calt n%d\n", t->retry)); |
1013 |
assert(t->left != NULL); |
1014 |
|
1015 |
if (v->mem[t->retry] == UNTRIED) { |
1016 |
d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); |
1017 |
if (ISERR()) |
1018 |
return v->err; |
1019 |
if (longest(v, d, begin, end, (int *)NULL) != end) { |
1020 |
freedfa(d); |
1021 |
v->mem[t->retry] = TRIED; |
1022 |
return caltdissect(v, t->right, begin, end); |
1023 |
} |
1024 |
freedfa(d); |
1025 |
MDEBUG(("calt matched\n")); |
1026 |
v->mem[t->retry] = TRYING; |
1027 |
} |
1028 |
|
1029 |
er = cdissect(v, t->left, begin, end); |
1030 |
if (er != REG_NOMATCH) |
1031 |
return er; |
1032 |
|
1033 |
v->mem[t->retry] = TRIED; |
1034 |
return caltdissect(v, t->right, begin, end); |
1035 |
} |
1036 |
|
1037 |
|
1038 |
#include "rege_dfa.c" |
1039 |
|
1040 |
/* End of regexec.c */ |