[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
bug#24603: [RFC 17/18] Optimise character class matching in regexes
From: |
Michal Nazarewicz |
Subject: |
bug#24603: [RFC 17/18] Optimise character class matching in regexes |
Date: |
Tue, 4 Oct 2016 03:10:40 +0200 |
Use lookup tables defined in src/character.h to bundle checks together
if possible. For example, ‘[[:lower:][:digit:]]’ would perform an
equivalence of ‘lowercasep(ch) || numericp(ch)’ check. Now, such checks
are done all at once with at most one Unicode general category lookup.
Similarly, do at most one syntax table lookup by unrolling macros
testing character properties.
* src/regex.c (execute_charset): Use category_char_bits and call SYNTAX
at most once.
* test/src/regex-tests.el (regex-tests--letter-character-classes): New
test case for various character classes relating to letters etc.
---
src/regex.c | 86 +++++++++++++++++++++++++++++++------------------
test/src/regex-tests.el | 45 ++++++++++++++++++++++++++
2 files changed, 100 insertions(+), 31 deletions(-)
diff --git a/src/regex.c b/src/regex.c
index 02da1fb..bfd04a1 100644
--- a/src/regex.c
+++ b/src/regex.c
@@ -1789,16 +1789,26 @@ struct range_table_work_area
/* Bits used to implement the multibyte-part of the various character classes
such as [:alnum:] in a charset's range table. The code currently assumes
that only the low 16 bits are used. */
-#define BIT_WORD 0x1
-#define BIT_LOWER 0x2
-#define BIT_PUNCT 0x4
-#define BIT_SPACE 0x8
-#define BIT_UPPER 0x10
-#define BIT_MULTIBYTE 0x20
-#define BIT_ALPHA 0x40
-#define BIT_ALNUM 0x80
-#define BIT_GRAPH 0x100
-#define BIT_PRINT 0x200
+#ifdef emacs
+# define BIT_ALNUM CHAR_BIT_ALNUM
+# define BIT_ALPHA CHAR_BIT_ALPHA
+# define BIT_UPPER CHAR_BIT_UPPER
+# define BIT_LOWER CHAR_BIT_LOWER
+# define BIT_GRAPH CHAR_BIT_GRAPH
+# define BIT_PRINT CHAR_BIT_PRINT
+#else
+# define BIT_ALNUM (1 << 0)
+# define BIT_ALPHA (1 << 1)
+# define BIT_UPPER (1 << 2)
+# define BIT_LOWER (1 << 3)
+# define BIT_GRAPH (1 << 4)
+# define BIT_PRINT (1 << 5)
+#endif
+#define BIT_WORD (BIT_PRINT << 1)
+#define BIT_PUNCT (BIT_PRINT << 2)
+#define BIT_SPACE (BIT_PRINT << 3)
+#define BIT_MULTIBYTE (BIT_PRINT << 4)
+
/* Set the bit for character C in a list. */
@@ -1988,9 +1998,6 @@ re_wctype_parse (const unsigned char **strp, unsigned
limit)
2 [:print:]
2 [:cntrl:]
1 [:ff:]
-
- If you update this list, consider also updating chain of or’ed conditions
- in execute_charset function.
*/
switch (it - beg) {
@@ -4657,28 +4664,45 @@ execute_charset (const_re_char **pp, unsigned c,
unsigned corig, bool unibyte)
else if (rtp)
{
int class_bits = CHARSET_RANGE_TABLE_BITS (p);
+ int bits;
re_wchar_t range_start, range_end;
- /* Sort tests by the most commonly used classes with some adjustment to which
- tests are easiest to perform. Take a look at comment in re_wctype_parse
- for table with frequencies of character class names. */
-
- if ((class_bits & BIT_MULTIBYTE) ||
- (class_bits & BIT_ALNUM && ISALNUM (c)) ||
- (class_bits & BIT_ALPHA && ISALPHA (c)) ||
- (class_bits & BIT_SPACE && ISSPACE (c)) ||
- (class_bits & BIT_WORD && ISWORD (c)) ||
- ((class_bits & BIT_UPPER) &&
- (ISUPPER (c) || (corig != c &&
- c == downcase (corig) && ISLOWER (c)))) ||
- ((class_bits & BIT_LOWER) &&
- (ISLOWER (c) || (corig != c &&
- c == upcase (corig) && ISUPPER(c)))) ||
- (class_bits & BIT_PUNCT && ISPUNCT (c)) ||
- (class_bits & BIT_GRAPH && ISGRAPH (c)) ||
- (class_bits & BIT_PRINT && ISPRINT (c)))
+ if (class_bits & BIT_MULTIBYTE)
return !not;
+ /* If we are at this point, the character is not an ASCII or single byte
+ character. This means that whenever ISFOO macros have special case for
+ IS_REAL_ASCII (c), we can ignore that. */
+
+ bits = class_bits & (BIT_ALNUM | BIT_ALPHA | BIT_UPPER | BIT_LOWER |
+ BIT_GRAPH | BIT_PRINT);
+ if (bits)
+ {
+ int char_bits = category_char_bits[char_unicode_category (c)];
+ if (bits & char_bits)
+ return !not;
+
+ /* Handle case folding. */
+ if (corig != c)
+ {
+ if ((bits & BIT_UPPER) && (char_bits & BIT_LOWER) &&
+ c == downcase (corig))
+ return !not;
+ if ((bits & BIT_LOWER) && (char_bits & BIT_UPPER) &&
+ c == upcase (corig))
+ return !not;
+ }
+ }
+
+ if (class_bits & (BIT_SPACE | BIT_WORD | BIT_PUNCT))
+ {
+ enum syntaxcode s = SYNTAX (c);
+ if (((class_bits & BIT_SPACE) && s == Swhitespace) ||
+ ((class_bits & BIT_WORD ) && s == Sword) ||
+ ((class_bits & BIT_PUNCT) && s != Sword))
+ return !not;
+ }
+
for (p = *pp; rtp < p; rtp += 2 * 3)
{
EXTRACT_CHARACTER (range_start, rtp);
diff --git a/test/src/regex-tests.el b/test/src/regex-tests.el
index fc50344..7617823 100644
--- a/test/src/regex-tests.el
+++ b/test/src/regex-tests.el
@@ -98,6 +98,51 @@ regex--test-cc
(eval `(ert-deftest ,name () ,doc ,(cons 'regex--test-cc test)) t)))
+(ert-deftest regex-tests--letter-character-classes ()
+ "Test character classes against various letters types."
+ (should-not
+ (cl-remove-if
+ 'not
+ (let ((check-ccs (lambda (ch fold)
+ (mapconcat
+ (lambda (str) str)
+ (let ((case-fold-search fold))
+ (cl-remove-if-not
+ (lambda (cc)
+ (string-match-p (concat "[[:" cc ":]]")
+ (string ch)))
+ '("alnum" "alpha" "upper" "lower")))
+ " "))))
+ (mapcar
+ (lambda (entry)
+ (let ((ch (car entry)) (expected (cdr entry)))
+ (setq entry
+ (format "%s | %s | case-fold: %s"
+ (get-char-code-property ch 'general-category)
+ (funcall check-ccs ch nil) (funcall check-ccs ch t)))
+ (unless (string-equal expected entry)
+ (format "\n%c expected: %s\nU+%06X but got: %s"
+ ch expected ch entry))))
+ '((?A . "Lu | alnum alpha upper | case-fold: alnum alpha upper lower")
+ (?ẞ . "Lu | alnum alpha upper | case-fold: alnum alpha upper lower")
+ (?DZ . "Lu | alnum alpha upper | case-fold: alnum alpha upper lower")
+ (?a . "Ll | alnum alpha lower | case-fold: alnum alpha upper lower")
+ ;; FIXME: Should match upper when case-fold case
+ ;; (?ł . "Ll | alnum alpha lower | case-fold: alnum alpha upper
lower")
+ ;; (?ß . "Ll | alnum alpha lower | case-fold: alnum alpha upper
lower")
+ ;; (?fi . "Ll | alnum alpha lower | case-fold: alnum alpha upper
lower")
+ ;; (?ɕ . "Ll | alnum alpha lower | case-fold: alnum alpha upper
lower")
+ ;; (?dz . "Ll | alnum alpha lower | case-fold: alnum alpha upper
lower")
+ (?ł . "Ll | alnum alpha lower | case-fold: alnum alpha lower")
+ (?ß . "Ll | alnum alpha lower | case-fold: alnum alpha lower")
+ (?fi . "Ll | alnum alpha lower | case-fold: alnum alpha lower")
+ (?ɕ . "Ll | alnum alpha lower | case-fold: alnum alpha lower")
+ (?dz . "Ll | alnum alpha lower | case-fold: alnum alpha lower")
+ (?Dz . "Lt | alnum alpha | case-fold: alnum alpha upper lower")
+ (?ʰ . "Lm | alnum alpha | case-fold: alnum alpha")
+ (?º . "Lo | alnum alpha | case-fold: alnum alpha")))))))
+
+
(defmacro regex-tests-generic-line (comment-char test-file whitelist &rest
body)
"Reads a line of the test file TEST-FILE, skipping
comments (defined by COMMENT-CHAR), and evaluates the tests in
--
2.8.0.rc3.226.g39d4020
- bug#24603: [RFC 02/18] Generate upcase and downcase tables from Unicode data, (continued)
- bug#24603: [RFC 02/18] Generate upcase and downcase tables from Unicode data, Michal Nazarewicz, 2016/10/03
- bug#24603: [RFC 02/18] Generate upcase and downcase tables from Unicode data, Eli Zaretskii, 2016/10/04
- bug#24603: [RFC 02/18] Generate upcase and downcase tables from Unicode data, Michal Nazarewicz, 2016/10/04
- bug#24603: [RFC 02/18] Generate upcase and downcase tables from Unicode data, Eli Zaretskii, 2016/10/04
- bug#24603: [RFC 02/18] Generate upcase and downcase tables from Unicode data, Michal Nazarewicz, 2016/10/04
- bug#24603: [RFC 02/18] Generate upcase and downcase tables from Unicode data, Eli Zaretskii, 2016/10/04
- bug#24603: [RFC 02/18] Generate upcase and downcase tables from Unicode data, Eli Zaretskii, 2016/10/04
- bug#24603: [RFC 02/18] Generate upcase and downcase tables from Unicode data, Michal Nazarewicz, 2016/10/06
- bug#24603: [RFC 02/18] Generate upcase and downcase tables from Unicode data, Eli Zaretskii, 2016/10/07
bug#24603: [RFC 18/18] Fix case-fold-search character class matching, Michal Nazarewicz, 2016/10/03
bug#24603: [RFC 17/18] Optimise character class matching in regexes,
Michal Nazarewicz <=
bug#24603: [RFC 10/18] Implement Turkic dotless and dotted i handling when casing strings, Michal Nazarewicz, 2016/10/03
bug#24603: [RFC 08/18] Support casing characters which map into multiple code points, Michal Nazarewicz, 2016/10/03
bug#24603: [PATCH 0/3] Case table updates, Michal Nazarewicz, 2016/10/17