[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH v2 06/10] uniname: Make codepoint transformation more flexible
From: |
Daiki Ueno |
Subject: |
[PATCH v2 06/10] uniname: Make codepoint transformation more flexible |
Date: |
Thu, 23 Oct 2014 17:01:37 +0900 |
The uniname module tries to reduce memory usage by grouping Unicode
codepoints so that each codepoint fit into a 16-bit integer (4-bit tag
plus 12-bit index). However, this transformation cannot represent all
the characters in Unicode 7.0, where > 16 groups are needed.
The attached patch tries to remove the limitation. It basically
switches the hard-coded transformation rule into a binary search on a
table of contiguous codepoint ranges.
* lib/uniname/gen-uninames.lisp (unicode-char): Rename CODE member
to INDEX, as it no longer represents a code-point.
(range): New struct.
(main): Switch to intervals list from a bit-pattern based
classification.
---
lib/uniname/gen-uninames.lisp | 86 +++++++++---------
lib/uniname/uniname.c | 203 +++++++++++++++++++++++++-----------------
2 files changed, 164 insertions(+), 125 deletions(-)
diff --git a/lib/uniname/gen-uninames.lisp b/lib/uniname/gen-uninames.lisp
index d08e93f..e7de0a1 100755
--- a/lib/uniname/gen-uninames.lisp
+++ b/lib/uniname/gen-uninames.lisp
@@ -6,12 +6,18 @@
(defparameter add-comments nil)
(defstruct unicode-char
- (code nil :type integer)
+ (index nil :type integer)
(name nil :type string)
word-indices
word-indices-index
)
+(defstruct range
+ (index nil :type integer)
+ (start-code nil :type integer)
+ (end-code nil :type integer)
+)
+
(defstruct word-list
(hashed nil :type hash-table)
(sorted nil :type list)
@@ -22,7 +28,10 @@
(defun main (inputfile outputfile)
(declare (type string inputfile outputfile))
#+UNICODE (setq *default-file-encoding* charset:utf-8)
- (let ((all-chars '()))
+ (let ((all-chars '())
+ (all-ranges '())
+ (name-index 0)
+ range)
;; Read all characters and names from the input file.
(with-open-file (istream inputfile :direction :input)
(loop
@@ -41,40 +50,26 @@
; specially as well.
(unless (or (<= #xF900 code #xFA2D) (<= #xFA30 code #xFA6A)
(<= #xFA70 code #xFAD9) (<= #x2F800 code #x2FA1D))
- ; Transform the code so that it fits in 16 bits. In
- ; Unicode 5.1 the following ranges are used.
- ; 0x00000..0x04DFF >>12= 0x00..0x04 -> 0x0..0x4
- ; 0x0A000..0x0AAFF >>12= 0x0A -> 0x5
- ; 0x0F900..0x0FFFF >>12= 0x0F -> 0x6
- ; 0x10000..0x10A58 >>12= 0x10 -> 0x7
- ; 0x12000..0x12473 >>12= 0x12 -> 0x8
- ; 0x1D000..0x1D7FF >>12= 0x1D -> 0x9
- ; 0x1F000..0x1F093 >>12= 0x1F -> 0xA
- ; 0x2F800..0x2FAFF >>12= 0x2F -> 0xB
- ; 0xE0000..0xE00FF >>12= 0xE0 -> 0xC
- (flet ((transform (x)
- (dpb
- (case (ash x -12)
- ((#x00 #x01 #x02 #x03 #x04) (ash x -12))
- (#x0A 5)
- (#x0F 6)
- (#x10 7)
- (#x12 8)
- (#x1D 9)
- (#x1F #xA)
- (#x2F #xB)
- (#xE0 #xC)
- (t (error "Update the transform function for
0x~5,'0X" x))
- )
- (byte 8 12)
- x
- )) )
- (push (make-unicode-char :code (transform code)
- :name name-string)
- all-chars
- ) ) ) ) )
+ (push (make-unicode-char :index name-index
+ :name name-string)
+ all-chars)
+ ;; Update the contiguous range, or start a new range.
+ (if (and range (= (1+ (range-end-code range)) code))
+ (setf (range-end-code range) code)
+ (progn
+ (when range
+ (push range all-ranges))
+ (setq range (make-range :index name-index
+ :start-code code
+ :end-code code))))
+ (incf name-index)
+ (setq last-code code)
+ ) ) )
) ) ) )
(setq all-chars (nreverse all-chars))
+ (if range
+ (push range all-ranges))
+ (setq all-ranges (nreverse all-ranges))
;; Split into words.
(let ((words-by-length (make-array 0 :adjustable t)))
(dolist (name (list* "HANGUL SYLLABLE" "CJK COMPATIBILITY" (mapcar
#'unicode-char-name all-chars)))
@@ -257,14 +252,14 @@
(incf i (length (unicode-char-word-indices uc)))
) )
(format ostream "};~%")
- (format ostream "static const struct { uint16_t code; uint32_t
name:24; }~%")
+ (format ostream "static const struct { uint16_t index; uint32_t
name:24; }~%")
(format ostream "#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__
>= 7)~%__attribute__((__packed__))~%#endif~%")
- (format ostream "unicode_name_to_code[~D] = {~%"
+ (format ostream "unicode_name_to_index[~D] = {~%"
(length all-chars)
)
(dolist (uc all-chars)
(format ostream " { 0x~4,'0X, ~D },"
- (unicode-char-code uc)
+ (unicode-char-index uc)
(unicode-char-word-indices-index uc)
)
(when add-comments
@@ -273,14 +268,14 @@
(format ostream "~%")
)
(format ostream "};~%")
- (format ostream "static const struct { uint16_t code; uint32_t
name:24; }~%")
+ (format ostream "static const struct { uint16_t index; uint32_t
name:24; }~%")
(format ostream "#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__
>= 7)~%__attribute__((__packed__))~%#endif~%")
- (format ostream "unicode_code_to_name[~D] = {~%"
+ (format ostream "unicode_index_to_name[~D] = {~%"
(length all-chars)
)
- (dolist (uc (sort (copy-list all-chars) #'< :key #'unicode-char-code))
+ (dolist (uc (sort (copy-list all-chars) #'< :key #'unicode-char-index))
(format ostream " { 0x~4,'0X, ~D },"
- (unicode-char-code uc)
+ (unicode-char-index uc)
(unicode-char-word-indices-index uc)
)
(when add-comments
@@ -295,6 +290,15 @@
(format ostream "#define UNICODE_CHARNAME_MAX_WORDS ~D~%"
(reduce #'max (mapcar (lambda (uc) (length
(unicode-char-word-indices uc))) all-chars))
)
+ (format ostream "static const struct { uint16_t index; uint32_t gap;
uint16_t length; } unicode_ranges[~D] = {~%"
+ (length all-ranges))
+ (dolist (range all-ranges)
+ (format ostream " { ~D, ~D, ~D },~%"
+ (range-index range)
+ (- (range-start-code range) (range-index range))
+ (1+ (- (range-end-code range) (range-start-code range))))
+ )
+ (format ostream "};~%")
)
) ) )
diff --git a/lib/uniname/uniname.c b/lib/uniname/uniname.c
index 8f4b32d..276a1cb 100644
--- a/lib/uniname/uniname.c
+++ b/lib/uniname/uniname.c
@@ -45,10 +45,11 @@
#define UNICODE_CHARNAME_WORD_CJK 417
#define UNICODE_CHARNAME_WORD_COMPATIBILITY 6107
static const uint16_t unicode_names[68940] = ...;
- static const struct { uint16_t code; uint32_t name:24; }
unicode_name_to_code[16626] = ...;
- static const struct { uint16_t code; uint32_t name:24; }
unicode_code_to_name[16626] = ...;
+ static const struct { uint16_t index; uint32_t name:24; }
unicode_name_to_index[16626] = ...;
+ static const struct { uint16_t index; uint32_t name:24; }
unicode_index_to_name[16626] = ...;
#define UNICODE_CHARNAME_MAX_LENGTH 83
#define UNICODE_CHARNAME_MAX_WORDS 13
+ static const struct { uint32_t index; uint32_t gap; uint16_t length; }
unicode_ranges[401] = ...;
*/
/* Returns the word with a given index. */
@@ -127,6 +128,82 @@ unicode_name_word_lookup (const char *word, unsigned int
length)
return -1;
}
+#define UNINAME_INVALID_INDEX UINT16_MAX
+
+/* Looks up the internal index of a Unicode character. */
+static uint16_t
+unicode_code_to_index (ucs4_t c)
+{
+ /* Binary search in unicode_ranges. */
+ unsigned int i1 = 0;
+ unsigned int i2 = SIZEOF (unicode_ranges);
+
+ for (;;)
+ {
+ unsigned int i = (i1 + i2) >> 1;
+ ucs4_t start_code =
+ unicode_ranges[i].index + unicode_ranges[i].gap;
+ ucs4_t end_code =
+ start_code + unicode_ranges[i].length - 1;
+
+ if (start_code <= c && c <= end_code)
+ return c - unicode_ranges[i].gap;
+
+ if (end_code < c)
+ {
+ if (i1 == i)
+ break;
+ /* Note here: i1 < i < i2. */
+ i1 = i;
+ }
+ else if (c < start_code)
+ {
+ if (i2 == i)
+ break;
+ /* Note here: i1 <= i < i2. */
+ i2 = i;
+ }
+ }
+ return UNINAME_INVALID_INDEX;
+}
+
+/* Looks up the codepoint of a Unicode character, from the given
+ internal index. */
+static ucs4_t
+unicode_index_to_code (uint16_t index)
+{
+ /* Binary search in unicode_ranges. */
+ unsigned int i1 = 0;
+ unsigned int i2 = SIZEOF (unicode_ranges);
+
+ for (;;)
+ {
+ unsigned int i = (i1 + i2) >> 1;
+ uint16_t start_index = unicode_ranges[i].index;
+ uint16_t end_index = start_index + unicode_ranges[i].length - 1;
+
+ if (start_index <= index && index <= end_index)
+ return index + unicode_ranges[i].gap;
+
+ if (end_index < index)
+ {
+ if (i1 == i)
+ break;
+ /* Note here: i1 < i < i2. */
+ i1 = i;
+ }
+ else if (index < start_index)
+ {
+ if (i2 == i)
+ break;
+ /* Note here: i1 <= i < i2. */
+ i2 = i;
+ }
+ }
+ return UNINAME_INVALID;
+}
+
+
/* Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
sections 3.11 and 4.4. */
static const char jamo_initial_short_name[19][3] =
@@ -203,78 +280,47 @@ unicode_character_name (ucs4_t c, char *buf)
}
else
{
- const uint16_t *words;
+ uint16_t index = unicode_code_to_index (c);
+ const uint16_t *words = NULL;
- /* Transform the code so that it fits in 16 bits. */
- switch (c >> 12)
+ if (index != UNINAME_INVALID_INDEX)
{
- case 0x00: case 0x01: case 0x02: case 0x03: case 0x04:
- break;
- case 0x0A:
- c -= 0x05000;
- break;
- case 0x0F:
- c -= 0x09000;
- break;
- case 0x10:
- c -= 0x09000;
- break;
- case 0x12:
- c -= 0x0A000;
- break;
- case 0x1D:
- c -= 0x14000;
- break;
- case 0x1F:
- c -= 0x15000;
- break;
- case 0x2F:
- c -= 0x24000;
- break;
- case 0xE0:
- c -= 0xD4000;
- break;
- default:
- return NULL;
+ /* Binary search in unicode_code_to_name. */
+ unsigned int i1 = 0;
+ unsigned int i2 = SIZEOF (unicode_index_to_name);
+ for (;;)
+ {
+ unsigned int i = (i1 + i2) >> 1;
+ if (unicode_index_to_name[i].index == index)
+ {
+ words = &unicode_names[unicode_index_to_name[i].name];
+ break;
+ }
+ else if (unicode_index_to_name[i].index < index)
+ {
+ if (i1 == i)
+ {
+ words = NULL;
+ break;
+ }
+ /* Note here: i1 < i < i2. */
+ i1 = i;
+ }
+ else if (unicode_index_to_name[i].index > index)
+ {
+ if (i2 == i)
+ {
+ words = NULL;
+ break;
+ }
+ /* Note here: i1 <= i < i2. */
+ i2 = i;
+ }
+ }
}
-
- {
- /* Binary search in unicode_code_to_name. */
- unsigned int i1 = 0;
- unsigned int i2 = SIZEOF (unicode_code_to_name);
- for (;;)
- {
- unsigned int i = (i1 + i2) >> 1;
- if (unicode_code_to_name[i].code == c)
- {
- words = &unicode_names[unicode_code_to_name[i].name];
- break;
- }
- else if (unicode_code_to_name[i].code < c)
- {
- if (i1 == i)
- {
- words = NULL;
- break;
- }
- /* Note here: i1 < i < i2. */
- i1 = i;
- }
- else if (unicode_code_to_name[i].code > c)
- {
- if (i2 == i)
- {
- words = NULL;
- break;
- }
- /* Note here: i1 <= i < i2. */
- i2 = i;
- }
- }
- }
if (words != NULL)
{
- /* Found it in unicode_code_to_name. Now concatenate the words. */
+ /* Found it in unicode_index_to_name. Now concatenate the words. */
/* buf needs to have at least UNICODE_CHARNAME_MAX_LENGTH bytes. */
char *ptr = buf;
for (;;)
@@ -463,15 +509,15 @@ unicode_name_character (const char *name)
for (; --i >= 0; )
words[i] = 2 * words[i] + 1;
}
- /* Binary search in unicode_name_to_code. */
+ /* Binary search in unicode_name_to_index. */
{
unsigned int i1 = 0;
- unsigned int i2 = SIZEOF (unicode_name_to_code);
+ unsigned int i2 = SIZEOF (unicode_name_to_index);
for (;;)
{
unsigned int i = (i1 + i2) >> 1;
const uint16_t *w = words;
- const uint16_t *p =
&unicode_names[unicode_name_to_code[i].name];
+ const uint16_t *p =
&unicode_names[unicode_name_to_index[i].name];
unsigned int n = words_length;
for (;;)
{
@@ -493,18 +539,7 @@ unicode_name_character (const char *name)
}
p++; w++; n--;
if (n == 0)
- {
- unsigned int c = unicode_name_to_code[i].code;
-
- /* Undo the transformation to 16-bit space. */
- static const unsigned int offset[13] =
- {
- 0x00000, 0x00000, 0x00000, 0x00000, 0x00000,
- 0x05000, 0x09000, 0x09000, 0x0A000, 0x14000,
- 0x15000, 0x24000, 0xD4000
- };
- return c + offset[c >> 12];
- }
+ return unicode_index_to_code
(unicode_name_to_index[i].index);
}
}
}
--
1.9.3
- [PATCH v2 00/10] Update libunistring-related modules to Unicode 7.0.0, Daiki Ueno, 2014/10/23
- [PATCH v2 01/10] gen-uni-tables: Minor style fixes, Daiki Ueno, 2014/10/23
- [PATCH v2 04/10] uniwbrk: Ignore Extended/Format at the beginning of the line, Daiki Ueno, 2014/10/23
- [PATCH v2 02/10] gen-uni-tables: Check out-of-range values added to 3-level tables, Daiki Ueno, 2014/10/23
- [PATCH v2 05/10] uniwbrk/u32-wordbreaks-tests: Test using WordBreakTest.txt from UCD, Daiki Ueno, 2014/10/23
- [PATCH v2 06/10] uniname: Make codepoint transformation more flexible,
Daiki Ueno <=
- [PATCH v2 03/10] unictype/joininggroup-of: Switch to 3-level table, Daiki Ueno, 2014/10/23
- [PATCH v2 07/10] Update to Unicode 6.1.0, Daiki Ueno, 2014/10/23
- [PATCH v2 08/10] Update to Unicode 6.2.0, Daiki Ueno, 2014/10/23
- [PATCH v2 09/10] Update to Unicode 6.3.0, Daiki Ueno, 2014/10/23
- Re: [PATCH v2 00/10] Update libunistring-related modules to Unicode 7.0.0, Pádraig Brady, 2014/10/23