[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[bug-libunistring] [PATCH 2/8] unictype/joininggroup-of: Switch to 3-level table
From: Daiki Ueno
Subject: [bug-libunistring] [PATCH 2/8] unictype/joininggroup-of: Switch to 3-level table
Date: Fri, 10 Oct 2014 22:59:47 +0900
* lib/gen-uni-tables.c (output_joining_group): Switch to a
3-level table to accommodate joining groups defined at higher
code-point values. Since only 88 groups are defined in
Unicode 7.0.0, use a 7-bit packed format for the level3 entries.
* lib/unictype/joininggroup_of.c (uc_joining_group): Adjust to use
the 3-level table.
* lib/unictype/joininggroup_of.h: Regenerate.
---
lib/gen-uni-tables.c | 155 ++++++++++++++++++++++++++++++-----------
lib/unictype/joininggroup_of.c | 29 ++++++--
2 files changed, 139 insertions(+), 45 deletions(-)
diff --git a/lib/gen-uni-tables.c b/lib/gen-uni-tables.c
index 56fe26c..3747875 100644
--- a/lib/gen-uni-tables.c
+++ b/lib/gen-uni-tables.c
@@ -3987,7 +3987,7 @@ output_joining_type (const char *filename, const char *version)
}
fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
- fprintf (stream, "/* Arabic joining type of Unicode characters. */\n");
+ fprintf (stream, "/* Arabic joining group of Unicode characters. */\n");
fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
version);
@@ -4213,11 +4213,22 @@ output_joining_group_test (const char *filename, const char *version)
}
}
+/* Construction of sparse 3-level tables. */
+#define TABLE joining_group_table
+#define ELEMENT uint8_t
+#define DEFAULT UC_JOINING_GROUP_NONE
+#define xmalloc malloc
+#define xrealloc realloc
+#include "3level.h"
+
static void
output_joining_group (const char *filename, const char *version)
{
FILE *stream;
- unsigned int ch_min, ch_max, ch, i;
+ unsigned int ch, i;
+ struct joining_group_table t;
+ unsigned int level1_offset, level2_offset, level3_offset;
+ uint16_t *level3_packed;
stream = fopen (filename, "w");
if (stream == NULL)
@@ -4231,51 +4242,115 @@ output_joining_group (const char *filename, const char *version)
fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
version);
- ch_min = 0x10FFFF;
+ t.p = 7;
+ t.q = 9;
+ joining_group_table_init (&t);
+
for (ch = 0; ch < 0x110000; ch++)
- if (unicode_joining_group[ch] != UC_JOINING_GROUP_NONE)
- {
- ch_min = ch;
- break;
- }
+ {
+ uint8_t value = unicode_joining_group[ch];
- ch_max = 0;
- for (ch = 0x10FFFF; ch > 0; ch--)
- if (unicode_joining_group[ch] != UC_JOINING_GROUP_NONE)
- {
- ch_max = ch;
- break;
- }
+ if (value > 0x7f)
+ abort ();
- if (!(ch_min <= ch_max))
- abort ();
+ joining_group_table_add (&t, ch, value);
+ }
- /* If the interval [ch_min, ch_max] is too large, we should better use a
- 3-level table. */
- if (!(ch_max - ch_min < 0x200))
- abort ();
+ joining_group_table_finalize (&t);
- fprintf (stream, "#define joining_group_header_0 0x%x\n", ch_min);
- fprintf (stream, "static const unsigned char u_joining_group[0x%x - 0x%x] =\n",
- ch_max + 1, ch_min);
- fprintf (stream, "{");
- for (i = 0; i <= ch_max - ch_min; i++)
- {
- const char *s;
+ /* Offsets in t.result, in memory of this process. */
+ level1_offset =
+ 5 * sizeof (uint32_t);
+ level2_offset =
+ 5 * sizeof (uint32_t)
+ + t.level1_size * sizeof (uint32_t);
+ level3_offset =
+ 5 * sizeof (uint32_t)
+ + t.level1_size * sizeof (uint32_t)
+ + (t.level2_size << t.q) * sizeof (uint32_t);
- ch = ch_min + i;
- if ((i % 2) == 0)
- fprintf (stream, "\n ");
- s = joining_group_as_c_identifier (unicode_joining_group[ch]);
- fprintf (stream, " %s", s);
- if (i+1 <= ch_max - ch_min)
- {
- fprintf (stream, ",");
- if (((i+1) % 2) != 0)
- fprintf (stream, "%*s", 38 - (int) strlen (s), "");
- }
+ for (i = 0; i < 5; i++)
+ fprintf (stream, "#define joining_group_header_%d %d\n", i,
+ ((uint32_t *) t.result)[i]);
+ fprintf (stream, "static const\n");
+ fprintf (stream, "struct\n");
+ fprintf (stream, " {\n");
+ fprintf (stream, " int level1[%zu];\n", t.level1_size);
+ fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
+ fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
+ (1 << t.p) * 7 / 16);
+ fprintf (stream, " }\n");
+ fprintf (stream, "u_joining_group =\n");
+ fprintf (stream, "{\n");
+ fprintf (stream, " {");
+ if (t.level1_size > 8)
+ fprintf (stream, "\n ");
+ for (i = 0; i < t.level1_size; i++)
+ {
+ uint32_t offset;
+ if (i > 0 && (i % 8) == 0)
+ fprintf (stream, "\n ");
+ offset = ((uint32_t *) (t.result + level1_offset))[i];
+ if (offset == 0)
+ fprintf (stream, " %5d", -1);
+ else
+ fprintf (stream, " %5zu",
+ (offset - level2_offset) / sizeof (uint32_t));
+ if (i+1 < t.level1_size)
+ fprintf (stream, ",");
}
- fprintf (stream, "\n");
+ if (t.level1_size > 8)
+ fprintf (stream, "\n ");
+ fprintf (stream, " },\n");
+ fprintf (stream, " {");
+ if (t.level2_size << t.q > 8)
+ fprintf (stream, "\n ");
+ for (i = 0; i < t.level2_size << t.q; i++)
+ {
+ uint32_t offset;
+ if (i > 0 && (i % 8) == 0)
+ fprintf (stream, "\n ");
+ offset = ((uint32_t *) (t.result + level2_offset))[i];
+ if (offset == 0)
+ fprintf (stream, " %5d", -1);
+ else
+ fprintf (stream, " %5zu",
+ (offset - level3_offset) / sizeof (uint8_t));
+ if (i+1 < t.level2_size << t.q)
+ fprintf (stream, ",");
+ }
+ if (t.level2_size << t.q > 8)
+ fprintf (stream, "\n ");
+ fprintf (stream, " },\n");
+ /* Pack the level3 array. Each entry needs 7 bits only. Use 16-bit units,
+ not 32-bit units, in order to make the lookup function easier. */
+ level3_packed =
+ (uint16_t *)
+ calloc ((t.level3_size << t.p) * 7 / 16 + 1, sizeof (uint16_t));
+ for (i = 0; i < t.level3_size << t.p; i++)
+ {
+ unsigned int j = (i * 7) / 16;
+ unsigned int k = (i * 7) % 16;
+ uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
+ value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
+ level3_packed[j] = value & 0xffff;
+ level3_packed[j+1] = value >> 16;
+ }
+ fprintf (stream, " {");
+ if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
+ fprintf (stream, "\n ");
+ for (i = 0; i < (t.level3_size << t.p) * 7 / 16 + 1; i++)
+ {
+ if (i > 0 && (i % 8) == 0)
+ fprintf (stream, "\n ");
+ fprintf (stream, " 0x%04x", level3_packed[i]);
+ if (i+1 < (t.level3_size << t.p) * 7 / 16 + 1)
+ fprintf (stream, ",");
+ }
+ if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
+ fprintf (stream, "\n ");
+ fprintf (stream, " }\n");
+ free (level3_packed);
fprintf (stream, "};\n");
if (ferror (stream) || fclose (stream))
diff --git a/lib/unictype/joininggroup_of.c b/lib/unictype/joininggroup_of.c
index 987af1e..c7b6846 100644
--- a/lib/unictype/joininggroup_of.c
+++ b/lib/unictype/joininggroup_of.c
@@ -20,14 +20,33 @@
/* Specification. */
#include "unictype.h"
-#include "unictype/joininggroup_of.h"
+/* Define u_joining_group table. */
+#include "joininggroup_of.h"
int
uc_joining_group (ucs4_t uc)
{
- if (uc >= joining_group_header_0
- && uc < joining_group_header_0
- + sizeof (u_joining_group) / sizeof (u_joining_group[0]))
- return u_joining_group[uc - joining_group_header_0];
+ unsigned int index1 = uc >> joining_group_header_0;
+ if (index1 < joining_group_header_1)
+ {
+ int lookup1 = u_joining_group.level1[index1];
+ if (lookup1 >= 0)
+ {
+ unsigned int index2 = (uc >> joining_group_header_2) & joining_group_header_3;
+ int lookup2 = u_joining_group.level2[lookup1 + index2];
+ if (lookup2 >= 0)
+ {
+ unsigned int index3 = ((uc & joining_group_header_4) + lookup2) * 7;
+ /* level3 contains 7-bit values, packed into 16-bit words. */
+ unsigned int lookup3 =
+ ((u_joining_group.level3[index3>>4]
+ | (u_joining_group.level3[(index3>>4)+1] << 16))
+ >> (index3 % 16))
+ & 0x7f;
+
+ return lookup3;
+ }
+ }
+ }
return UC_JOINING_GROUP_NONE;
}
--
2.1.1
- [bug-libunistring] [PATCH 0/8] Update libunistring-related modules to Unicode 7.0.0, Daiki Ueno, 2014/10/10
- [bug-libunistring] [PATCH 1/8] gen-uni-tables: Check out-of-range values added to 3-level tables, Daiki Ueno, 2014/10/10
- [bug-libunistring] [PATCH 2/8] unictype/joininggroup-of: Switch to 3-level table,
Daiki Ueno <=
- [bug-libunistring] [PATCH 4/8] uniwbrk/u32-wordbreaks-tests: Test using WordBreakTest.txt from UCD, Daiki Ueno, 2014/10/10
- [bug-libunistring] [PATCH 3/8] uniwbrk: Ignore Extended/Format at the beginning of the line, Daiki Ueno, 2014/10/10
- [bug-libunistring] [PATCH 5/8] Update to Unicode 6.1.0, Daiki Ueno, 2014/10/10
- [bug-libunistring] [PATCH 7/8] Update to Unicode 6.3.0, Daiki Ueno, 2014/10/10
- [bug-libunistring] [PATCH 6/8] Update to Unicode 6.2.0, Daiki Ueno, 2014/10/10
- [bug-libunistring] [PATCH 8/8] Update to Unicode 7.0.0, Daiki Ueno, 2014/10/10