qemacs-commit
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Qemacs-commit] qemacs charset.c charsetjis.c extras.c qe.c qe....


From: Charlie Gordon
Subject: [Qemacs-commit] qemacs charset.c charsetjis.c extras.c qe.c qe....
Date: Wed, 15 Mar 2017 19:42:23 -0400 (EDT)

CVSROOT:        /sources/qemacs
Module name:    qemacs
Changes by:     Charlie Gordon <chqrlie>        17/03/15 19:42:23

Modified files:
        .              : charset.c charsetjis.c extras.c qe.c qe.h tty.c 
                         unihex.c 

Log message:
        basic: charset improvements
        - add QECharset.encode_table as a default for CharsetDecodeState.table
        - make CharsetDecodeState.table const
        - use static initializers for utf8 tables
        - add al32utf8 charset name (used in Oracle for true utf-8 encoding)
        - use direct pointers in unicode_glyph_range_index
        - initialize unicode_glyph_range_index at run time
        - rename unicode_glyph_tty_width as unicode_tty_glyph_width
        - change unihex display to space out single width characters

CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/qemacs/charset.c?cvsroot=qemacs&r1=1.45&r2=1.46
http://cvs.savannah.gnu.org/viewcvs/qemacs/charsetjis.c?cvsroot=qemacs&r1=1.9&r2=1.10
http://cvs.savannah.gnu.org/viewcvs/qemacs/extras.c?cvsroot=qemacs&r1=1.55&r2=1.56
http://cvs.savannah.gnu.org/viewcvs/qemacs/qe.c?cvsroot=qemacs&r1=1.251&r2=1.252
http://cvs.savannah.gnu.org/viewcvs/qemacs/qe.h?cvsroot=qemacs&r1=1.236&r2=1.237
http://cvs.savannah.gnu.org/viewcvs/qemacs/tty.c?cvsroot=qemacs&r1=1.71&r2=1.72
http://cvs.savannah.gnu.org/viewcvs/qemacs/unihex.c?cvsroot=qemacs&r1=1.35&r2=1.36

Patches:
Index: charset.c
===================================================================
RCS file: /sources/qemacs/qemacs/charset.c,v
retrieving revision 1.45
retrieving revision 1.46
diff -u -b -r1.45 -r1.46
--- charset.c   15 Mar 2017 23:17:58 -0000      1.45
+++ charset.c   15 Mar 2017 23:42:23 -0000      1.46
@@ -51,31 +51,13 @@
     UINT_MAX, 1,              /* 26: catchall */
 };
 
-static unsigned int const unicode_glyph_range_index[16] = {
-    2 * 0,   /* 0000-0FFF */
-    2 * 0,   /* 1000-1FFF */
-    2 * 2,   /* 2000-2FFF */
-    2 * 7,   /* 3000-3FFF */
-    2 * 9,   /* 4000-4FFF */
-    2 * 11,  /* 5000-5FFF */
-    2 * 11,  /* 6000-6FFF */
-    2 * 11,  /* 7000-7FFF */
-    2 * 11,  /* 8000-8FFF */
-    2 * 11,  /* 9000-9FFF */
-    2 * 11,  /* A000-AFFF */
-    2 * 13,  /* B000-BFFF */
-    2 * 13,  /* C000-CFFF */
-    2 * 13,  /* D000-DFFF */
-    2 * 14,  /* E000-EFFF */
-    2 * 14,  /* F000-FFFF */
-};
+static const unsigned int *unicode_glyph_range_index[0x20];
 
-int unicode_glyph_tty_width(unsigned int ucs)
+int unicode_tty_glyph_width(unsigned int ucs)
 {
-    unsigned int const *ip;
-
     /* Iterative lookup with fast initial jump, no boundary test needed */
-    ip = unicode_glyph_ranges + unicode_glyph_range_index[(ucs >> 12) & 0xF];
+    /* Very efficient for BMP and SMP code-points */
+    unsigned int const *ip = unicode_glyph_range_index[(ucs >> 12) & 0x1F];
 
     while (ucs > ip[0]) {
         ip += 2;
@@ -85,28 +67,62 @@
 
 /* utf-8 specific tables */
 
-static unsigned short table_idem[256];
-static unsigned short table_utf8[256];
-static unsigned short table_none[256];
+#define REP2(x)    x, x
+#define REP4(x)    x, x, x, x
+#define REP8(x)    REP4(x), REP4(x)
+#define REP16(x)   REP4(x), REP4(x), REP4(x), REP4(x)
+#define REP32(x)   REP16(x), REP16(x)
+#define REP64(x)   REP16(x), REP16(x), REP16(x), REP16(x)
+#define REP128(x)  REP64(x), REP64(x)
+#define REP256(x)  REP64(x), REP64(x), REP64(x), REP64(x)
+
+#define RUN2(x)    (x)+0, (x)+1
+#define RUN4(x)    (x)+0, (x)+1, (x)+2, (x)+3
+#define RUN8(x)    RUN4(x), RUN4((x)+4)
+#define RUN16(x)   RUN4(x), RUN4((x)+4), RUN4((x)+8), RUN4((x)+12)
+#define RUN32(x)   RUN16(x), RUN16((x)+16)
+#define RUN64(x)   RUN16(x), RUN16((x)+16), RUN16((x)+32), RUN16((x)+48)
+#define RUN128(x)  RUN64(x), RUN64((x)+64)
+#define RUN256(x)  RUN64(x), RUN64((x)+64), RUN64((x)+128), RUN64((x)+192)
+
+static unsigned short const table_idem[256] = { RUN256(0) };
+static unsigned short const table_none[256] = { REP256(ESCAPE_CHAR) };
+
+static unsigned short const table_utf8[256] = {
+    RUN128(0),              /* [0x00...0x80] are self-encoding ASCII bytes */
+    REP64(INVALID_CHAR),    /* [0x80...0xC0] are invalid prefix bytes */
+    REP32(ESCAPE_CHAR),     /* [0xC0...0xE0] leading bytes of 2 byte sequences */
+    REP16(ESCAPE_CHAR),     /* [0xE0...0xF0] leading bytes of 3 byte sequences */
+    REP8(ESCAPE_CHAR),      /* [0xF0...0xF8] leading bytes of 4 byte sequences */
+    REP4(ESCAPE_CHAR),      /* [0xF8...0xFC] leading bytes of 5 byte sequences */
+    REP2(ESCAPE_CHAR),      /* [0xFC...0xFE] leading bytes of 6 byte sequences */
+    INVALID_CHAR,           /* 0xFE is invalid in UTF-8 encoding */
+    INVALID_CHAR,           /* 0xFF is invalid in UTF-8 encoding */
+};
 
-unsigned char utf8_length[256];
+unsigned char const utf8_length[256] = {
+    REP128(1),  /* [0x00...0x80] are self-encoding ASCII bytes */
+    REP64(1),   /* [0x80...0xC0] are invalid prefix bytes, could use 0 */
+    REP32(2),   /* [0xC0...0xE0] leading bytes of 2 byte sequences */
+    REP16(3),   /* [0xE0...0xF0] leading bytes of 3 byte sequences */
+    REP8(4),    /* [0xF0...0xF8] leading bytes of 4 byte sequences */
+    REP4(5),    /* [0xF8...0xFC] leading bytes of 5 byte sequences */
+    REP2(6),    /* [0xFC...0xFE] leading bytes of 6 byte sequences */
+    1,          /* 0xFE is invalid in UTF-8 encoding */
+    1,          /* 0xFF is invalid in UTF-8 encoding */
+};
 
-static const unsigned int utf8_min_code[7] = {
+static unsigned int const utf8_min_code[7] = {
     0, 0, 0x80, 0x800, 0x10000, 0x00200000, 0x04000000,
 };
 
-static const unsigned char utf8_first_code_mask[7] = {
+static unsigned char const utf8_first_code_mask[7] = {
     0, 0, 0x1f, 0xf, 0x7, 0x3, 0x1,
 };
 
 /********************************************************/
 /* raw */
 
-static void decode_raw_init(CharsetDecodeState *s)
-{
-    s->table = table_idem;
-}
-
 static u8 *encode_raw(qe__unused__ QECharset *charset, u8 *p, int c)
 {
     if (c <= 0xff) {
@@ -121,14 +137,14 @@
     "raw",
     "binary|none",
     NULL,
-    decode_raw_init,
+    NULL,
     decode_8bit,
     encode_raw,
     charset_get_pos_8bit,
     charset_get_chars_8bit,
     charset_goto_char_8bit,
     charset_goto_line_8bit,
-    1, 0, 0, 10, 0, 0, NULL, NULL,
+    1, 0, 0, 10, 0, 0, table_idem, NULL, NULL,
 };
 
 /********************************************************/
@@ -174,11 +190,6 @@
         return 0;
 }
 
-static void decode_8859_1_init(CharsetDecodeState *s)
-{
-    s->table = table_idem;
-}
-
 static u8 *encode_8859_1(qe__unused__ QECharset *charset, u8 *p, int c)
 {
     if (c <= 0xff) {
@@ -193,24 +204,19 @@
     "8859-1",
     "ISO-8859-1|iso-ir-100|latin1|l1|819",
     probe_8859_1,
-    decode_8859_1_init,
+    NULL,
     decode_8bit,
     encode_8859_1,
     charset_get_pos_8bit,
     charset_get_chars_8bit,
     charset_goto_char_8bit,
     charset_goto_line_8bit,
-    1, 0, 0, 10, 0, 0, NULL, NULL,
+    1, 0, 0, 10, 0, 0, table_idem, NULL, NULL,
 };
 
 /********************************************************/
 /* vt100 */
 
-static void decode_vt100_init(CharsetDecodeState *s)
-{
-    s->table = table_idem;
-}
-
 static u8 *encode_vt100(qe__unused__ QECharset *charset, u8 *p, int c)
 {
     if (c <= 0xff) {
@@ -225,14 +231,14 @@
     "vt100",
     NULL,
     NULL,
-    decode_vt100_init,
+    NULL,
     decode_8bit,
     encode_vt100,
     charset_get_pos_8bit,
     charset_get_chars_8bit,
     charset_goto_char_8bit,
     charset_goto_line_8bit,
-    1, 0, 0, 10, 0, 0, NULL, NULL,
+    1, 0, 0, 10, 0, 0, table_idem, NULL, NULL,
 };
 
 /********************************************************/
@@ -252,14 +258,14 @@
     "7bit",
     "us-ascii|ascii|7-bit|iso-ir-6|ANSI_X3.4|646",
     NULL,
-    decode_8859_1_init,
+    NULL,
     decode_8bit,
     encode_7bit,
     charset_get_pos_8bit,
     charset_get_chars_8bit,
     charset_goto_char_8bit,
     charset_goto_line_8bit,
-    1, 0, 0, 10, 0, 0, NULL, NULL,
+    1, 0, 0, 10, 0, 0, table_idem, NULL, NULL,
 };
 
 /********************************************************/
@@ -435,11 +441,6 @@
         return 0;
 }
 
-static void decode_utf8_init(CharsetDecodeState *s)
-{
-    s->table = table_utf8;
-}
-
 static int decode_utf8_func(CharsetDecodeState *s)
 {
     return utf8_decode((const char **)(void *)&s->p);
@@ -550,16 +551,16 @@
 
 struct QECharset charset_utf8 = {
     "utf-8",
-    "utf8",
+    "utf8|al32utf8",
     probe_utf8,
-    decode_utf8_init,
+    NULL,
     decode_utf8_func,
     encode_utf8,
     charset_get_pos_utf8,
     charset_get_chars_utf8,
     charset_goto_char_utf8,
     charset_goto_line_8bit,
-    1, 1, 0, 10, 0, 0, NULL, NULL,
+    1, 1, 0, 10, 0, 0, table_utf8, NULL, NULL,
 };
 
 /********************************************************/
@@ -603,11 +604,6 @@
         return 0;
 }
 
-static void decode_ucs_init(CharsetDecodeState *s)
-{
-    s->table = table_none;
-}
-
 static int decode_ucs2le(CharsetDecodeState *s)
 {
     /* XXX: should handle surrogates */
@@ -821,28 +817,28 @@
     "ucs2le",
     "utf16le|utf-16le",
     probe_ucs2le,
-    decode_ucs_init,
+    NULL,
     decode_ucs2le,
     encode_ucs2le,
     charset_get_pos_ucs2,
     charset_get_chars_ucs2,
     charset_goto_char_ucs2,
     charset_goto_line_ucs2,
-    2, 0, 0, 10, 0, 0, NULL, NULL,
+    2, 0, 0, 10, 0, 0, table_none, NULL, NULL,
 };
 
 struct QECharset charset_ucs2be = {
     "ucs2be",
     "ucs2|utf16|utf-16|utf16be|utf-16be",
     probe_ucs2be,
-    decode_ucs_init,
+    NULL,
     decode_ucs2be,
     encode_ucs2be,
     charset_get_pos_ucs2,
     charset_get_chars_ucs2,
     charset_goto_char_ucs2,
     charset_goto_line_ucs2,
-    2, 0, 0, 10, 0, 0, NULL, NULL,
+    2, 0, 0, 10, 0, 0, table_none, NULL, NULL,
 };
 
 static int probe_ucs4le(qe__unused__ QECharset *charset, const u8 *buf, int size)
@@ -1093,28 +1089,28 @@
     "ucs4le",
     "utf32le|utf-32le",
     probe_ucs4le,
-    decode_ucs_init,
+    NULL,
     decode_ucs4le,
     encode_ucs4le,
     charset_get_pos_ucs4,
     charset_get_chars_ucs4,
     charset_goto_char_ucs4,
     charset_goto_line_ucs4,
-    4, 0, 0, 10, 0, 0, NULL, NULL,
+    4, 0, 0, 10, 0, 0, table_none, NULL, NULL,
 };
 
 struct QECharset charset_ucs4be = {
     "ucs4be",
     "ucs4|utf32|utf-32|utf32be|utf-32be",
     probe_ucs4be,
-    decode_ucs_init,
+    NULL,
     decode_ucs4be,
     encode_ucs4be,
     charset_get_pos_ucs4,
     charset_get_chars_ucs4,
     charset_goto_char_ucs4,
     charset_goto_line_ucs4,
-    4, 0, 0, 10, 0, 0, NULL, NULL,
+    4, 0, 0, 10, 0, 0, table_none, NULL, NULL,
 };
 
 /********************************************************/
@@ -1181,11 +1177,12 @@
 void charset_decode_init(CharsetDecodeState *s, QECharset *charset,
                          EOLType eol_type)
 {
-    s->table = NULL; /* fail safe */
+    s->table = charset->encode_table;  /* default encode table */
     if (charset->table_alloc) {
         s->table = qe_malloc_array(unsigned short, 256);
         if (!s->table) {
             charset = &charset_8859_1;
+            s->table = charset->encode_table;
         }
     }
     s->charset = charset;
@@ -1202,8 +1199,10 @@
 
 void charset_decode_close(CharsetDecodeState *s)
 {
-    if (s->charset->table_alloc)
-        qe_free(&s->table);
+    if (s->charset->table_alloc) {
+        /* remove the const qualifier */
+        qe_free((unsigned short **)&s->table);
+    }
     /* safety */
     memset(s, 0, sizeof(CharsetDecodeState));
 }
@@ -1551,7 +1550,7 @@
     unsigned short *table;
     int i, n;
 
-    table = s->table;
+    table = (unsigned short *)s->table;     /* remove const qualifier */
     for (i = 0; i < charset->min_char; i++)
         *table++ = i;
     n = charset->max_char - charset->min_char + 1;
@@ -1706,36 +1705,16 @@
 
 void charset_init(void)
 {
-    int l, i, n;
-
-    for (i = 0; i < 256; i++) {
-        table_idem[i] = i;
-        table_none[i] = ESCAPE_CHAR;
-    }
-
-    /* utf8 tables */
+    /* initialize unicode_glyph_range_index[] */
+    unsigned int const *ip = unicode_glyph_ranges;
+    unsigned int ucs;
 
-    // could set utf8_length[128...0xc0] to 0 as invalid bytes
-    memset(utf8_length, 1, 256);
-
-    i = 0xc0;
-    l = 2;
-    while (l <= 6) {
-        n = utf8_first_code_mask[l] + 1;
-        while (n > 0) {
-            utf8_length[i++] = l;
-            n--;
-        }
-        l++;
+    for (ucs = 0; ucs < 0x20000; ucs += 0x1000) {
+        while (ucs > ip[0])
+            ip += 2;
+        unicode_glyph_range_index[ucs >> 12] = ip;
     }
 
-    for (i = 0; i < 256; i++)
-        table_utf8[i] = INVALID_CHAR;
-    for (i = 0; i < 0x80; i++)
-        table_utf8[i] = i;
-    for (i = 0xc0; i < 0xfe; i++)
-        table_utf8[i] = ESCAPE_CHAR;
-
     qe_register_charset(&charset_raw);
     qe_register_charset(&charset_8859_1);
     qe_register_charset(&charset_vt100);

Index: charsetjis.c
===================================================================
RCS file: /sources/qemacs/qemacs/charsetjis.c,v
retrieving revision 1.9
retrieving revision 1.10
diff -u -b -r1.9 -r1.10
--- charsetjis.c        16 Sep 2015 22:18:23 -0000      1.9
+++ charsetjis.c        15 Mar 2017 23:42:23 -0000      1.10
@@ -2,6 +2,7 @@
  * JIS Charset handling for QEmacs
  *
  * Copyright (c) 2002 Fabrice Bellard.
+ * Copyright (c) 2002-2017 Charlie Gordon.
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
@@ -78,7 +79,8 @@
 
 static void decode_euc_jp_init(CharsetDecodeState *s)
 {
-    unsigned short *table = s->table;
+    /* XXX: should use static table instead of removing const qualifier */
+    unsigned short *table = (unsigned short *)s->table;
     int i;
 
     for (i = 0; i < 256; i++)
@@ -162,7 +164,8 @@
 
 static void decode_sjis_init(CharsetDecodeState *s)
 {
-    unsigned short *table = s->table;
+    /* XXX: should use static table instead of removing const qualifier */
+    unsigned short *table = (unsigned short *)s->table;
     int i;
 
     for (i = 0; i < 256; i++)

Index: extras.c
===================================================================
RCS file: /sources/qemacs/qemacs/extras.c,v
retrieving revision 1.55
retrieving revision 1.56
diff -u -b -r1.55 -r1.56
--- extras.c    15 Mar 2017 07:24:31 -0000      1.55
+++ extras.c    15 Mar 2017 23:42:23 -0000      1.56
@@ -275,13 +275,13 @@
             col += tw - col % tw;
             continue;
         }
-        col += unicode_glyph_tty_width(c);
+        col += unicode_tty_glyph_width(c);
         if (c != ' ' || offset < start || col % tw == 0)
             continue;
         while (offset1 < stop) {
             c = eb_nextc(b, offset1, &offset2);
             if (c == ' ') {
-                col += unicode_glyph_tty_width(c);
+                col += unicode_tty_glyph_width(c);
                 offset1 = offset2;
                 if (col % tw == 0) {
                     delta = eb_delete_range(b, offset, offset1);
@@ -342,7 +342,7 @@
             continue;
         }
         if (c != '\t') {
-            col += unicode_glyph_tty_width(c);
+            col += unicode_tty_glyph_width(c);
             continue;
         }
         col0 = col;

Index: qe.c
===================================================================
RCS file: /sources/qemacs/qemacs/qe.c,v
retrieving revision 1.251
retrieving revision 1.252
diff -u -b -r1.251 -r1.252
--- qe.c        15 Mar 2017 07:32:48 -0000      1.251
+++ qe.c        15 Mar 2017 23:42:23 -0000      1.252
@@ -1642,7 +1642,7 @@
             if (c == '\t') {
                 col += tw - col % tw;
             } else {
-                col += unicode_glyph_tty_width(c);
+                col += unicode_tty_glyph_width(c);
             }
         }
         if (argval == NO_ARG)
@@ -8070,7 +8070,7 @@
 
 const char str_version[] = "QEmacs version " QE_VERSION;
 const char str_credits[] = "Copyright (c) 2000-2003 Fabrice Bellard\n"
-                           "Copyright (c) 2000-2016 Charlie Gordon\n";
+                           "Copyright (c) 2000-2017 Charlie Gordon\n";
 
 static void show_version(void)
 {

Index: qe.h
===================================================================
RCS file: /sources/qemacs/qemacs/qe.h,v
retrieving revision 1.236
retrieving revision 1.237
diff -u -b -r1.236 -r1.237
--- qe.h        15 Mar 2017 07:32:48 -0000      1.236
+++ qe.h        15 Mar 2017 23:42:23 -0000      1.237
@@ -569,6 +569,7 @@
     /* private data for some charsets */
     u8 eol_char; /* 0x0A for ASCII, 0x25 for EBCDIC */
     u8 min_char, max_char;
+    const unsigned short *encode_table;
     const unsigned short *private_table;
     struct QECharset *next;
 };
@@ -591,7 +592,7 @@
 
 struct CharsetDecodeState {
     /* 256 ushort table for hyper fast decoding */
-    unsigned short *table;
+    const unsigned short *table;
     int char_size;
     EOLType eol_type;
     int eol_char;
@@ -612,7 +613,7 @@
 
 void qe_register_charset(struct QECharset *charset);
 
-extern unsigned char utf8_length[256];
+extern unsigned char const utf8_length[256];
 static inline int utf8_is_trailing_byte(int c) { return (c & 0xC0) == 0x80; }
 int utf8_encode(char *q, int c);
 int utf8_decode(const char **pp);
@@ -635,7 +636,7 @@
 int decode_8bit(CharsetDecodeState *s);
 u8 *encode_8bit(QECharset *charset, u8 *q, int c);
 
-int unicode_glyph_tty_width(unsigned int ucs);
+int unicode_tty_glyph_width(unsigned int ucs);
 
 /* arabic.c */
 int arab_join(unsigned int *line, unsigned int *ctog, int len);

Index: tty.c
===================================================================
RCS file: /sources/qemacs/qemacs/tty.c,v
retrieving revision 1.71
retrieving revision 1.72
diff -u -b -r1.71 -r1.72
--- tty.c       15 Mar 2017 07:24:31 -0000      1.71
+++ tty.c       15 Mar 2017 23:42:23 -0000      1.72
@@ -1033,7 +1033,7 @@
     if (ucs < 0x1100)
         return 1;
 
-    return unicode_glyph_tty_width(ucs);
+    return unicode_tty_glyph_width(ucs);
 }
 
 static void tty_term_text_metrics(QEditScreen *s, qe__unused__ QEFont *font,
@@ -1140,6 +1140,7 @@
     /* We cannot print anything on the bottom right screen cell,
      * pretend it's OK: */
     ts->screen[shadow - 1] = ts->screen[2 * shadow - 1];
+
     for (y = 0; y < s->height; y++) {
         if (ts->line_updated[y]) {
             ts->line_updated[y] = 0;

Index: unihex.c
===================================================================
RCS file: /sources/qemacs/qemacs/unihex.c,v
retrieving revision 1.35
retrieving revision 1.36
diff -u -b -r1.35 -r1.36
--- unihex.c    15 Mar 2017 07:38:58 -0000      1.35
+++ unihex.c    15 Mar 2017 23:42:23 -0000      1.36
@@ -146,11 +146,10 @@
             }
         }
         display_char(ds, offset1, offset2, b);
-#if 0
-        /* CG: spacing out single width glyphs is less readable */
-        if (unicode_glyph_tty_width(b) == 1)
+        /* spacing out single width glyphs may be less readable */
+        if (unicode_tty_glyph_width(b) < 2) {
             display_char(ds, -1, -1, ' ');
-#endif
+        }
     }
     display_eol(ds, -1, -1);
 



reply via email to

[Prev in Thread] Current Thread [Next in Thread]