From 67484a67d7d310d76a2eb80b68a8ec8eb5c6a7fc Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka
Date: Mon, 28 Nov 2016 22:26:07 +0900
Subject: [PATCH] dfa: avoid match middle in multibyte character

* lib/dfa.c (transit_state): If fails in matching single byte characters
on a state including period expression in non-UTF8 multibyte locales,
skip trailing bytes.
(dfa_supported): Revert previous change.
---
 ChangeLog |    8 ++++++++
 lib/dfa.c |    8 +-------
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index fec4fb9..fd062ae 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,11 @@
+2016-11-27  Norihiro Tanaka
+
+	dfa: avoid match middle in multibyte character
+	* lib/dfa.c (transit_state): If fails in matching single byte characters
+	on a state including period expression in non-UTF8 multibyte locales,
+	skip trailing bytes.
+	(dfa_supported): Revert previous change.
+
 2016-11-27  Jim Meyering
 
 	dfa: avoid false match in non-UTF8 multibyte locales
diff --git a/lib/dfa.c b/lib/dfa.c
index f0ed139..673ef95 100644
--- a/lib/dfa.c
+++ b/lib/dfa.c
@@ -2913,7 +2913,7 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
   /* Calculate the state which can be reached from the state 's' by
      consuming 'mbclen' single bytes from the buffer. */
   s1 = s;
-  for (i = 0; i < mbclen && 0 <= s; i++)
+  for (i = 0; i < mbclen && (i == 0 || d->min_trcount <= s); i++)
     s = transit_state_singlebyte (d, s, pp);
   *pp += mbclen - i;
 
@@ -3272,12 +3272,6 @@ free_mbdata (struct dfa *d)
 static bool _GL_ATTRIBUTE_PURE
 dfa_supported (struct dfa const *d)
 {
-  /* Declare any non-UTF8 multibyte locale "not supported." Otherwise, a
-     regexp like ".*7" would mistakenly match \uC9, e.g., via this command:
-     (export LC_ALL=zh_CN.gb18030; printf '\uC9\n' | grep '.*7') */
-  if (d->localeinfo.multibyte && !d->localeinfo.using_utf8)
-    return false;
-
   size_t i;
   for (i = 0; i < d->tindex; i++)
     {
-- 
1.7.1