>From da0e8454a8e68035ef4b87dbb9097f85df6ece27 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Sat, 12 Sep 2020 18:51:55 -0700 Subject: [PATCH 1/7] dfa: use backward set in removal of epsilon closure When removing in epsilon closure, the code searched all nodes sequentially, and this was slow for some cases. Build a backward set before search, and only check previous position with the set. Problem reported in . * lib/dfa.c (struct dfa): New member 'epsilon'. (addtok_mb): Check whether a pattern has epsilon node or not. (epsclosure): New arg BACKWORD; caller changed. When removing epsilon node and reconnecting, check only previous positions. Treat BEG as if it were character. (dfaanalyze): Build backward set. --- lib/dfa.c | 65 +++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 51 insertions(+), 14 deletions(-) diff --git a/lib/dfa.c b/lib/dfa.c index 1f0587a7a..7c8eb6685 100644 --- a/lib/dfa.c +++ b/lib/dfa.c @@ -488,6 +488,7 @@ struct dfa bool fast; /* The DFA is fast. */ token utf8_anychar_classes[9]; /* To lower ANYCHAR in UTF-8 locales. */ mbstate_t mbs; /* Multibyte conversion state. */ + bool epsilon; /* The following are valid only if MB_CUR_MAX > 1. */ @@ -1615,13 +1616,21 @@ addtok_mb (struct dfa *dfa, token t, char mbprop) dfa->parse.depth--; break; - case BACKREF: - dfa->fast = false; + case BEGLINE: + case ENDLINE: + case BEGWORD: + case ENDWORD: + case LIMWORD: + case NOTLIMWORD: + case EMPTY: + dfa->epsilon = true; FALLTHROUGH; + default: - dfa->nleaves++; - FALLTHROUGH; - case EMPTY: + if (t == BACKREF) + dfa->fast = false; + if (t != EMPTY) + dfa->nleaves++; dfa->parse.depth++; break; } @@ -2297,14 +2306,15 @@ state_index (struct dfa *d, position_set const *s, int context) constraint. Repeat exhaustively until no funny positions are left. S->elems must be large enough to hold the result. */ static void -epsclosure (struct dfa const *d) +epsclosure (struct dfa const *d, position_set *backword) { position_set tmp; alloc_position_set (&tmp, d->nleaves); for (idx_t i = 0; i < d->tindex; i++) if (d->follows[i].nelem > 0 && d->tokens[i] >= NOTCHAR && d->tokens[i] != BACKREF && d->tokens[i] != ANYCHAR - && d->tokens[i] != MBCSET && d->tokens[i] < CSET) + && d->tokens[i] != MBCSET && d->tokens[i] < CSET + && d->tokens[i] != BEG) { unsigned int constraint; switch (d->tokens[i]) @@ -2327,16 +2337,21 @@ epsclosure (struct dfa const *d) case NOTLIMWORD: constraint = NOTLIMWORD_CONSTRAINT; break; - default: + case EMPTY: constraint = NO_CONSTRAINT; break; + default: + abort (); } delete (i, &d->follows[i]); - for (idx_t j = 0; j < d->tindex; j++) - if (i != j && d->follows[j].nelem > 0) - replace (&d->follows[j], i, &d->follows[i], constraint, &tmp); + for (idx_t j = 0; j < backword[i].nelem; j++) + replace (&d->follows[backword[i].elems[j].index], i, &d->follows[i], + constraint, &tmp); + for (idx_t j = 0; j < d->follows[i].nelem; j++) + replace (&backword[d->follows[i].elems[j].index], i, &backword[i], + NO_CONSTRAINT, &tmp); } free (tmp.elems); } @@ -2643,6 +2658,7 @@ dfaanalyze (struct dfa *d, bool searchflag) /* Firstpos and lastpos elements. */ position *firstpos = posalloc; position *lastpos = firstpos + d->nleaves; + position_set *backword = NULL; position pos; position_set tmp; @@ -2675,6 +2691,9 @@ dfaanalyze (struct dfa *d, bool searchflag) alloc_position_set (&merged, d->nleaves); d->follows = xcalloc (d->tindex, sizeof *d->follows); + if (d->epsilon) + backword = xcalloc (d->tindex, sizeof *backword); + for (idx_t i = 0; i < d->tindex; i++) { switch (d->tokens[i]) @@ -2712,6 +2731,17 @@ dfaanalyze (struct dfa *d, bool searchflag) case CAT: /* Every element in the firstpos of the second argument is in the follow of every element in the lastpos of the first argument. */ + if (d->epsilon) + { + tmp.nelem = stk[-2].nlastpos; + tmp.elems = lastpos - stk[-1].nlastpos - stk[-2].nlastpos; + position *p = firstpos - stk[-1].nfirstpos; + for (idx_t j = 0; j < stk[-1].nfirstpos; j++) + { + merge (&tmp, &backword[p[j].index], &merged); + copy (&merged, &backword[p[j].index]); + } + } { tmp.nelem = stk[-1].nfirstpos; tmp.elems = firstpos - stk[-1].nfirstpos; @@ -2801,9 +2831,15 @@ dfaanalyze (struct dfa *d, bool searchflag) #endif } - /* For each follow set that is the follow set of a real position, replace - it with its epsilon closure. */ - epsclosure (d); + if (d->epsilon) + { + /* For each follow set that is the follow set of a real position, + replace it with its epsilon closure. */ + epsclosure (d, backword); + + for (idx_t i = 0; i < d->tindex; i++) + free (backword[i].elems); + } dfaoptimize (d); @@ -2865,6 +2901,7 @@ dfaanalyze (struct dfa *d, bool searchflag) d->min_trcount++; d->trcount = 0; + free (backword); free (posalloc); free (stkalloc); free (merged.elems); -- 2.17.1