From fffe326a7cd7d452f86ea51cae6fe08168a1391e Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Sun, 19 Apr 2020 11:01:18 +0900 Subject: [PATCH] dfa: use backword set in removal of epsilon closure When remove epsilon closure, so far searched nodes including epsilon closure in all nodes sequentially, but it is slow for some cases. Now build backword set before search, and only check previos position with the set. Problem reported in: https://debbugs.gnu.org/cgi/bugreport.cgi?bug=40634 * lib/dfa.c (struct dfa): New member 'epsilon'. (addtok_mb): Check whether a pattern has epsilon node or not. (epsclosure): When remove epsilon node and reconnect, only check previos positions. (dfaanalyze): Build backword set. --- lib/dfa.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++------------- 1 files changed, 51 insertions(+), 14 deletions(-) diff --git a/lib/dfa.c b/lib/dfa.c index 9939d22..811c41e 100644 --- a/lib/dfa.c +++ b/lib/dfa.c @@ -486,6 +486,7 @@ struct dfa bool fast; /* The DFA is fast. */ token utf8_anychar_classes[9]; /* To lower ANYCHAR in UTF-8 locales. */ mbstate_t mbs; /* Multibyte conversion state. */ + bool epsilon; /* The following are valid only if MB_CUR_MAX > 1. */ @@ -1613,13 +1614,21 @@ addtok_mb (struct dfa *dfa, token t, char mbprop) dfa->parse.depth--; break; - case BACKREF: - dfa->fast = false; + case BEGLINE: + case ENDLINE: + case BEGWORD: + case ENDWORD: + case LIMWORD: + case NOTLIMWORD: + case EMPTY: + dfa->epsilon = true; FALLTHROUGH; + default: - dfa->nleaves++; - FALLTHROUGH; - case EMPTY: + if (t == BACKREF) + dfa->fast = false; + if (t != EMPTY) + dfa->nleaves++; dfa->parse.depth++; break; } @@ -2295,14 +2304,15 @@ state_index (struct dfa *d, position_set const *s, int context) constraint. Repeat exhaustively until no funny positions are left. S->elems must be large enough to hold the result. */ static void -epsclosure (struct dfa const *d) +epsclosure (struct dfa const *d, position_set *backword) { position_set tmp; alloc_position_set (&tmp, d->nleaves); for (idx_t i = 0; i < d->tindex; i++) if (d->follows[i].nelem > 0 && d->tokens[i] >= NOTCHAR && d->tokens[i] != BACKREF && d->tokens[i] != ANYCHAR - && d->tokens[i] != MBCSET && d->tokens[i] < CSET) + && d->tokens[i] != MBCSET && d->tokens[i] < CSET + && d->tokens[i] != BEG) { unsigned int constraint; switch (d->tokens[i]) @@ -2325,16 +2335,21 @@ epsclosure (struct dfa const *d) case NOTLIMWORD: constraint = NOTLIMWORD_CONSTRAINT; break; - default: + case EMPTY: constraint = NO_CONSTRAINT; break; + default: + abort (); } delete (i, &d->follows[i]); - for (idx_t j = 0; j < d->tindex; j++) - if (i != j && d->follows[j].nelem > 0) - replace (&d->follows[j], i, &d->follows[i], constraint, &tmp); + for (idx_t j = 0; j < backword[i].nelem; j++) + replace (&d->follows[backword[i].elems[j].index], i, &d->follows[i], + constraint, &tmp); + for (idx_t j = 0; j < d->follows[i].nelem; j++) + replace (&backword[d->follows[i].elems[j].index], i, &backword[i], + NO_CONSTRAINT, &tmp); } free (tmp.elems); } @@ -2641,6 +2656,7 @@ dfaanalyze (struct dfa *d, bool searchflag) /* Firstpos and lastpos elements. */ position *firstpos = posalloc; position *lastpos = firstpos + d->nleaves; + position_set *backword = NULL; position pos; position_set tmp; @@ -2673,6 +2689,9 @@ dfaanalyze (struct dfa *d, bool searchflag) alloc_position_set (&merged, d->nleaves); d->follows = xcalloc (d->tindex, sizeof *d->follows); + if (d->epsilon) + backword = xcalloc (d->tindex, sizeof *backword); + for (idx_t i = 0; i < d->tindex; i++) { switch (d->tokens[i]) @@ -2710,6 +2729,17 @@ dfaanalyze (struct dfa *d, bool searchflag) case CAT: /* Every element in the firstpos of the second argument is in the follow of every element in the lastpos of the first argument. */ + if (d->epsilon) + { + tmp.nelem = stk[-2].nlastpos; + tmp.elems = lastpos - stk[-1].nlastpos - stk[-2].nlastpos; + position *p = firstpos - stk[-1].nfirstpos; + for (idx_t j = 0; j < stk[-1].nfirstpos; j++) + { + merge (&tmp, &backword[p[j].index], &merged); + copy (&merged, &backword[p[j].index]); + } + } { tmp.nelem = stk[-1].nfirstpos; tmp.elems = firstpos - stk[-1].nfirstpos; @@ -2799,9 +2829,15 @@ dfaanalyze (struct dfa *d, bool searchflag) #endif } - /* For each follow set that is the follow set of a real position, replace - it with its epsilon closure. */ - epsclosure (d); + if (d->epsilon) + { + /* For each follow set that is the follow set of a real position, + replace it with its epsilon closure. */ + epsclosure (d, backword); + + for (idx_t i = 0; i < d->tindex; i++) + free (backword[i].elems); + } dfaoptimize (d); @@ -2863,6 +2899,7 @@ dfaanalyze (struct dfa *d, bool searchflag) d->min_trcount++; d->trcount = 0; + free (backword); free (posalloc); free (stkalloc); free (merged.elems); -- 1.7.1