diff -ru gprolog-1.2.16/src/BipsPl/c_supp.c gprolog-1.2.16.patch/src/BipsPl/c_supp.c --- gprolog-1.2.16/src/BipsPl/c_supp.c 2002-04-05 09:47:32.000000000 +0300 +++ gprolog-1.2.16.patch/src/BipsPl/c_supp.c 2005-06-02 00:56:16.389371872 +0300 @@ -439,21 +439,36 @@ * RD_CHAR_CHECK * * * *-------------------------------------------------------------------------*/ -int +long Rd_Char_Check(WamWord start_word) { WamWord word, tag_mask; int atom; +#ifdef USE_UTF8_HACK + wchar_t wc; +#endif DEREF(start_word, word, tag_mask); if (tag_mask == TAG_REF_MASK) Pl_Err_Instantiation(); atom = UnTag_ATM(word); - if (tag_mask != TAG_ATM_MASK || atom_tbl[atom].prop.length != 1) + if (tag_mask != TAG_ATM_MASK || atom_tbl[atom].prop.length != +#ifdef USE_UTF8_HACK + mblen(atom_tbl[atom].name, atom_tbl[atom].prop.length) +#else + 1 +#endif + ) Pl_Err_Type(type_character, word); +#ifdef USE_UTF8_HACK + wc = atom_tbl[atom].name[0]; + mbtowc(&wc, atom_tbl[atom].name, atom_tbl[atom].prop.length); + return wc; +#else return atom_tbl[atom].name[0]; +#endif } @@ -463,16 +478,25 @@ * RD_CHAR * * * *-------------------------------------------------------------------------*/ -int +long Rd_Char(WamWord start_word) { WamWord word, tag_mask; int atom; +#ifdef USE_UTF8_HACK + wchar_t wc; +#endif DEREF(start_word, word, tag_mask); atom = UnTag_ATM(word); +#ifdef USE_UTF8_HACK + wc = atom_tbl[atom].name[0]; + mbtowc(&wc, atom_tbl[atom].name, atom_tbl[atom].prop.length); + return wc; +#else return atom_tbl[atom].name[0]; +#endif } @@ -482,11 +506,14 @@ * RD_IN_CHAR_CHECK * * * *-------------------------------------------------------------------------*/ -int +long Rd_In_Char_Check(WamWord start_word) { WamWord word, tag_mask; int atom; +#ifdef USE_UTF8_HACK + wchar_t wc; +#endif DEREF(start_word, word, tag_mask); if (tag_mask == TAG_REF_MASK) @@ -494,10 +521,25 @@ atom = UnTag_ATM(word); if (tag_mask != TAG_ATM_MASK || - (atom != atom_end_of_file && atom_tbl[atom].prop.length != 1)) + (atom != 
atom_end_of_file && atom_tbl[atom].prop.length != +#ifdef USE_UTF8_HACK + mblen(atom_tbl[atom].name, atom_tbl[atom].prop.length) +#else + 1 +#endif + )) Pl_Err_Type(type_in_character, word); - return (atom != atom_end_of_file) ? atom_tbl[atom].name[0] : -1; + if (atom == atom_end_of_file) + return -1; + +#ifdef USE_UTF8_HACK + wc = atom_tbl[atom].name[0]; + mbtowc(&wc, atom_tbl[atom].name, atom_tbl[atom].prop.length); + return wc; +#else + return atom_tbl[atom].name[0]; +#endif } @@ -507,15 +549,28 @@ * RD_IN_CHAR * * * *-------------------------------------------------------------------------*/ -int +long Rd_In_Char(WamWord start_word) { WamWord word, tag_mask; int atom; +#ifdef USE_UTF8_HACK + wchar_t wc; +#endif DEREF(start_word, word, tag_mask); atom = UnTag_ATM(word); - return (atom != atom_end_of_file) ? atom_tbl[atom].name[0] : -1; + + if (atom == atom_end_of_file) + return -1; + +#ifdef USE_UTF8_HACK + wc = atom_tbl[atom].name[0]; + mbtowc(&wc, atom_tbl[atom].name, atom_tbl[atom].prop.length); + return wc; +#else + return atom_tbl[atom].name[0]; +#endif } @@ -525,10 +580,10 @@ * RD_CODE_CHECK * * * *-------------------------------------------------------------------------*/ -int +long Rd_Code_Check(WamWord start_word) { - int c; + long c; c = Rd_Integer_Check(start_word); if (!Is_Valid_Code(c)) @@ -544,7 +599,7 @@ * RD_CODE * * * *-------------------------------------------------------------------------*/ -int +long Rd_Code(WamWord start_word) { return Rd_Integer(start_word); @@ -557,10 +612,10 @@ * RD_IN_CODE_CHECK * * * *-------------------------------------------------------------------------*/ -int +long Rd_In_Code_Check(WamWord start_word) { - int c; + long c; c = Rd_Integer_Check(start_word); if (c != -1 && !Is_Valid_Code(c)) @@ -576,7 +631,7 @@ * RD_IN_CODE * * * *-------------------------------------------------------------------------*/ -int +long Rd_In_Code(WamWord start_word) { return Rd_Integer(start_word); @@ -761,6 +816,9 @@ WamWord 
save_start_word; WamWord *lst_adr; int n = 0; +#ifdef USE_UTF8_HACK + int clen; +#endif save_start_word = start_word; @@ -779,8 +837,18 @@ lst_adr = UnTag_LST(word); +#ifdef USE_UTF8_HACK + wctomb(NULL,0); + clen = wctomb(str, Rd_Char_Check(Car(lst_adr))); + if (clen > 0) + { + str += clen; + n += clen; + } +#else *str++ = Rd_Char_Check(Car(lst_adr)); n++; +#endif start_word = Cdr(lst_adr); } @@ -803,6 +871,9 @@ WamWord save_start_word; WamWord *lst_adr; int n = 0; +#ifdef USE_UTF8_HACK + int clen; +#endif save_start_word = start_word; @@ -815,8 +886,18 @@ lst_adr = UnTag_LST(word); +#ifdef USE_UTF8_HACK + wctomb(NULL,0); + clen = wctomb(str, Rd_Char_Check(Car(lst_adr))); + if (clen > 0) + { + str += clen; + n += clen; + } +#else *str++ = Rd_Char_Check(Car(lst_adr)); n++; +#endif start_word = Cdr(lst_adr); } @@ -839,6 +920,9 @@ WamWord save_start_word; WamWord *lst_adr; int n = 0; +#ifdef USE_UTF8_HACK + int clen; +#endif save_start_word = start_word; @@ -857,8 +941,18 @@ lst_adr = UnTag_LST(word); +#ifdef USE_UTF8_HACK + wctomb(NULL,0); + clen = wctomb(str, Rd_Code_Check(Car(lst_adr))); + if (clen > 0) + { + str += clen; + n += clen; + } +#else *str++ = Rd_Code_Check(Car(lst_adr)); n++; +#endif start_word = Cdr(lst_adr); } @@ -881,6 +975,9 @@ WamWord save_start_word; WamWord *lst_adr; int n = 0; +#ifdef USE_UTF8_HACK + int clen; +#endif save_start_word = start_word; @@ -893,8 +990,18 @@ lst_adr = UnTag_LST(word); +#ifdef USE_UTF8_HACK + wctomb(NULL,0); + clen = wctomb(str, Rd_Code_Check(Car(lst_adr))); + if (clen > 0) + { + str += clen; + n += clen; + } +#else *str++ = Rd_Code_Check(Car(lst_adr)); n++; +#endif start_word = Cdr(lst_adr); } @@ -1836,9 +1943,27 @@ Bool Un_Chars(char *str, WamWord start_word) { + int atom; +#ifdef USE_UTF8_HACK + int clen = 1; + int len = strlen(str); + for (; *str; str+=clen, len-=clen) + { + wchar_t wc; + if ( (mbtowc(NULL,NULL,0), clen = mbtowc(&wc,str,len)) > 1 && wc >= 256) + atom = Create_Char_Atom(wc); + else + { + clen = 1; + 
atom = ATOM_CHAR(*str); + } +#else for (; *str; str++) { - if (!Get_List(start_word) || !Unify_Atom(ATOM_CHAR(*str))) + atom = ATOM_CHAR(*str); +#endif + + if (!Get_List(start_word) || !Unify_Atom(atom)) return FALSE; start_word = Unify_Variable(); @@ -1872,9 +1997,24 @@ Bool Un_Codes(char *str, WamWord start_word) { +#ifdef USE_UTF8_HACK + int clen = 1; + int len = strlen(str); + for (; *str; str+=clen, len-=clen) + { + wchar_t c; + if ( (mbtowc(NULL,NULL,0), clen = mbtowc(&c,str,len)) < 1 ) + { + clen = 1; + c = (unsigned char) *str; /* undecodable byte: use its unsigned value, as the #else branch does */ + } +#else for (; *str; str++) { - if (!Get_List(start_word) || !Unify_Integer(*str)) + unsigned char c = *str; +#endif + + if (!Get_List(start_word) || !Unify_Integer(c)) return FALSE; start_word = Unify_Variable(); diff -ru gprolog-1.2.16/src/BipsPl/c_supp.h gprolog-1.2.16.patch/src/BipsPl/c_supp.h --- gprolog-1.2.16/src/BipsPl/c_supp.h 2002-03-19 20:24:34.000000000 +0200 +++ gprolog-1.2.16.patch/src/BipsPl/c_supp.h 2005-06-02 00:56:16.389371872 +0300 @@ -64,21 +64,21 @@ int Rd_Boolean(WamWord start_word); -int Rd_Char_Check(WamWord start_word); +long Rd_Char_Check(WamWord start_word); -int Rd_Char(WamWord start_word); +long Rd_Char(WamWord start_word); -int Rd_In_Char_Check(WamWord start_word); +long Rd_In_Char_Check(WamWord start_word); -int Rd_In_Char(WamWord start_word); +long Rd_In_Char(WamWord start_word); -int Rd_Code_Check(WamWord start_word); +long Rd_Code_Check(WamWord start_word); -int Rd_Code(WamWord start_word); +long Rd_Code(WamWord start_word); -int Rd_In_Code_Check(WamWord start_word); +long Rd_In_Code_Check(WamWord start_word); -int Rd_In_Code(WamWord start_word); +long Rd_In_Code(WamWord start_word); int Rd_Byte_Check(WamWord start_word); diff -ru gprolog-1.2.16/src/BipsPl/scan_supp.c gprolog-1.2.16.patch/src/BipsPl/scan_supp.c --- gprolog-1.2.16/src/BipsPl/scan_supp.c 2002-04-05 09:47:32.000000000 +0300 +++ gprolog-1.2.16.patch/src/BipsPl/scan_supp.c 2005-06-02 00:56:16.390371720 +0300 @@ -42,6 +42,9 @@ 
/*---------------------------------* * Constants * *---------------------------------*/ +#ifndef USE_UTF8_HACK +#define wint_t int +#endif /*---------------------------------* * Type Definitions * @@ -69,8 +72,8 @@ static void Scan_Quoted(StmInf *pstm); -static int Scan_Quoted_Char(StmInf *pstm, Bool convert, int c0, - Bool no_escape); +static wint_t Scan_Quoted_Char(StmInf *pstm, Bool convert, int c0, + Bool no_escape); @@ -103,6 +106,10 @@ * READ_NEXT_CHAR * * * *-------------------------------------------------------------------------*/ +#ifdef USE_UTF8_HACK +static int +UTF8_Hack_Peek_Next_Char(StmInf *pstm, Bool convert); +#endif static int Read_Next_Char(StmInf *pstm, Bool convert) { @@ -115,11 +122,62 @@ if (convert) c = Char_Conversion(c); - c_type = char_type[c]; +#ifdef USE_UTF8_HACK + if (c >= 192) + c_type = UTF8_Hack_Peek_Next_Char(pstm,convert); + else if ( pstm->char_count <= pstm->utf8_end + && pstm->utf8_begin < pstm->char_count ) + c_type = pstm->utf8_ctype; + else +#endif + c_type = char_type[c]; } return c; } +#ifdef USE_UTF8_HACK +static int +UTF8_Hack_Peek_Next_Char(StmInf *pstm, Bool convert) +{ + int i, len; + unsigned char cs[6]; + wchar_t wc; + + if (!(c&0x40)) return pstm->utf8_ctype; + + pstm->utf8_end = pstm->utf8_begin = pstm->char_count-1; + + cs[0] = c; + for (i=1, c<<=1 ; i<6 && c&0x80 ; c<<=1) + { + int a = Stream_Getc(pstm); + if (a == EOF) + goto on_eof; + cs[i++] = a; + } + len = i; + while (--i) + Stream_Ungetc(cs[i],pstm); + c = cs[0]; + + mbtowc(NULL,NULL,0); + if (mbtowc(&wc, cs, len) < 2) goto bad_char; + + /*if (convert) + wc = Char_Conversion(wc);*/ + + pstm->utf8_ctype = UTF8_Hack_Classify_Char(wc); + pstm->utf8_end = pstm->utf8_begin+len; + return pstm->utf8_ctype; + +on_eof: + while (--i) + Stream_Ungetc(cs[i],pstm); + c = cs[0]; +bad_char: + return char_type[c]; +} +#endif @@ -321,6 +379,7 @@ if (!integer_only && /* float if . and digit */ c == '.' 
&& isdigit(Scan_Peek_Char(pstm, TRUE))) goto is_a_float; + /* integer number */ token.type = TOKEN_INTEGER; *p++ = '\0'; @@ -330,15 +389,16 @@ if (c == '\'') /* 0' */ { - c = Scan_Quoted_Char(pstm, TRUE, '\'', FALSE); - if (c == -1) /* is ' */ + wint_t wc = Scan_Quoted_Char(pstm, TRUE, '\'', FALSE); + + if (wc == -1) /* is ' */ { token.line = pstm->line_count + 1; token.col = pstm->line_pos + 1; err_msg = "quote character expected here"; } - if (c == -2 || c == -3) + if (wc == -2 || wc == -3) /* EOF or NL or \ NL */ { Unget_Last_Char; @@ -347,7 +407,7 @@ err_msg = "character expected here"; } - token.int_num = c; + token.int_num = wc; return; } @@ -449,20 +509,25 @@ for (;;) { - c = Scan_Quoted_Char(pstm, convert, c0, no_escape); - if (c == -1) + wint_t wc = Scan_Quoted_Char(pstm, convert, c0, no_escape); + if (wc == -1) { *s = '\0'; return; } - if (c == -2) /* EOF or \n */ + if (wc == -2) /* EOF or \n */ break; - if (c == -3) /* \ followed by \n */ + if (wc == -3) /* \ followed by \n */ continue; - *s++ = c; +#ifdef USE_UTF8_HACK + wctomb(NULL,0); + { int clen = wctomb(s,wc); if (clen > 0) s += clen; } /* wctomb() returns -1 for an unencodable wc; never move s backwards */ +#else + *s++ = wc; +#endif } /* error */ *s = '\0'; @@ -501,12 +566,13 @@ * SCAN_QUOTED_CHAR * * * *-------------------------------------------------------------------------*/ -static int +static wint_t Scan_Quoted_Char(StmInf *pstm, Bool convert, int c0, Bool no_escape) { - int radix; + int shift; char *p, *f; - int x, i; + int i; + wint_t x; Read_Next_Char(pstm, convert); if (c == c0) @@ -538,13 +604,13 @@ { if (c == 'x') { - radix = 16; + shift = 4; f = "0123456789abcdefABCDEF"; x = 0; } else { - radix = 8; + shift = 3; f = "01234567"; x = c - '0'; } @@ -555,7 +621,8 @@ i = p - f; if (i >= 16) i -= 6; - x = x * radix + i; + x = (x<line_pos; err_msg = "invalid character code in \\constant\\ sequence"; } + if (c != '\\') { if (err_msg == NULL) @@ -577,7 +645,7 @@ Unget_Last_Char; } - return (int) (unsigned char) x; + return (wint_t) (unsigned long) x; } if (err_msg == NULL) diff -ru 
gprolog-1.2.16/src/BipsPl/stream_supp.c gprolog-1.2.16.patch/src/BipsPl/stream_supp.c --- gprolog-1.2.16/src/BipsPl/stream_supp.c 2002-09-19 14:00:36.000000000 +0300 +++ gprolog-1.2.16.patch/src/BipsPl/stream_supp.c 2005-06-02 00:56:16.390371720 +0300 @@ -401,6 +401,12 @@ pstm->line_count = 0; pstm->line_pos = 0; PB_Init(pstm->pb_line_pos); + +#ifdef USE_UTF8_HACK + pstm->utf8_ctype = 0; + pstm->utf8_begin = 0; + pstm->utf8_end = 0; +#endif } diff -ru gprolog-1.2.16/src/BipsPl/stream_supp.h gprolog-1.2.16.patch/src/BipsPl/stream_supp.h --- gprolog-1.2.16/src/BipsPl/stream_supp.h 2002-05-07 20:45:48.000000000 +0300 +++ gprolog-1.2.16.patch/src/BipsPl/stream_supp.h 2005-06-02 00:56:16.390371720 +0300 @@ -145,6 +145,11 @@ int line_count; /* line read count */ int line_pos; /* line position */ PbStk pb_line_pos; /* line position push back stack */ +#ifdef USE_UTF8_HACK /* -------- UTF-8 hack ---------- */ + int utf8_ctype; /* type of multibyte character||0 */ + int utf8_begin; /* beginning of sequence position */ + int utf8_end; /* end of sequence position */ +#endif } StmInf; diff -ru gprolog-1.2.16/src/BipsPl/write_supp.c gprolog-1.2.16.patch/src/BipsPl/write_supp.c --- gprolog-1.2.16/src/BipsPl/write_supp.c 2002-04-09 11:26:42.000000000 +0300 +++ gprolog-1.2.16.patch/src/BipsPl/write_supp.c 2005-06-02 00:56:16.390371720 +0300 @@ -503,7 +503,14 @@ if (prop.needs_scan) { +#ifdef USE_UTF8_HACK + int len = prop.length; + int clen; + wchar_t wc; + for (p = atom_tbl[atom].name; *p; p++, len--) +#else for (p = atom_tbl[atom].name; *p; p++) +#endif if ((q = (char *) strchr(escape_char, *p))) { Out_Char('\\'); @@ -514,6 +521,23 @@ Out_Char(*p); Out_Char(*p); } +#ifdef USE_UTF8_HACK + else if ( (mbtowc(NULL,NULL,0), clen = mbtowc(&wc,p,len)) >= 2 ) + { + if (!iswprint(wc)) + { + sprintf(str, "\\x%lx\\", (unsigned long) wc); /* %lx requires an unsigned long argument */ + Out_String(str); + p += clen-1; len -= clen-1; /* keep len equal to the bytes left at p */ + } + else + { + while (--clen) + Out_Char(*p++), len--; /* keep len equal to the bytes left at p */ + Out_Char(*p); + } + } +#endif else if (!isprint(*p)) { sprintf(str, "\\x%x\\", 
(unsigned) (unsigned char) *p); diff -ru gprolog-1.2.16/src/configure.in gprolog-1.2.16.patch/src/configure.in --- gprolog-1.2.16/src/configure.in 2002-09-19 13:57:32.000000000 +0300 +++ gprolog-1.2.16.patch/src/configure.in 2005-06-02 00:56:16.388372024 +0300 @@ -45,6 +45,7 @@ USE_GUI_CONSOLE=yes USE_SOCKETS=yes USE_FD_SOLVER=yes +USE_UTF8_HACK=yes DLL_W32GUICONS=w32guicons LIB_LINEDIT=liblinedit @@ -201,6 +202,12 @@ *) USE_FD_SOLVER=yes;; esac]) +AC_ARG_ENABLE(utf8-hack, [ --enable-utf8-hack recognise types of UTF-8 characters], + [case "$enableval" in + yes) AC_DEFINE(USE_UTF8_HACK) USE_UTF8_HACK=yes;; + *) USE_UTF8_HACK=no;; + esac]) + # *********************** diff -ru gprolog-1.2.16/src/EnginePl/atom.c gprolog-1.2.16.patch/src/EnginePl/atom.c --- gprolog-1.2.16/src/EnginePl/atom.c 2002-03-19 20:24:36.000000000 +0200 +++ gprolog-1.2.16.patch/src/EnginePl/atom.c 2005-06-02 00:56:16.390371720 +0300 @@ -29,6 +29,9 @@ #include #include #include +#ifdef USE_UTF8_HACK +#include +#endif #define ATOM_FILE @@ -258,10 +261,13 @@ AtomInf *patom; AtomProp prop; char *p; - int c_type; + int c_type, first_c_type = 0; /* the loop below never runs for the empty atom, so give this a defined value */ int lg; Bool identifier; Bool graphic; +#ifdef USE_UTF8_HACK + int clen, len; +#endif patom = Locate_Atom(name); @@ -279,9 +285,28 @@ identifier = graphic = (*name != '\0'); + prop.length = lg = strlen(name); + +#ifdef USE_UTF8_HACK + len = lg; + for (p = name; *p; p += clen, len -= clen) + { + wchar_t wc; + if ( (mbtowc(NULL,NULL,0), clen = mbtowc(&wc, p, len)) < 2) + { + clen = 1; + c_type = char_type[(unsigned char) *p]; + } + else + c_type = UTF8_Hack_Classify_Char(wc); +#else for (p = name; *p; p++) { c_type = char_type[(unsigned char) *p]; +#endif + + if (p == name) + first_c_type = c_type; if ((c_type & (UL | CL | SL | DI)) == 0) identifier = FALSE; @@ -293,14 +318,12 @@ prop.needs_scan = TRUE; } - prop.length = lg = p - name; - #ifndef NO_USE_LINEDIT if (lg > 1 && identifier) LE_Compl_Add_Word(name, lg); #endif - if (char_type[(unsigned char) *name] != SL) 
/* small letter */ + if (first_c_type != SL) /* small letter */ identifier = FALSE; @@ -318,7 +341,7 @@ goto finish; } - if (lg == 1 && char_type[(unsigned char) *name] == SC) + if (lg == 1 && first_c_type == SC) { prop.type = SOLO_ATOM; prop.needs_quote = (*name == ','); @@ -340,9 +363,40 @@ +#ifdef USE_UTF8_HACK +/*-------------------------------------------------------------------------* + * CREATE_CHAR_ATOM * + * * + *-------------------------------------------------------------------------*/ +int Create_Char_Atom(unsigned long wc) +{ + char c[7]; + int len = wctomb(c, wc); + c[len > 0 ? len : 0] = 0; /* wctomb() returns -1 on failure; do not write c[-1] */ + return Create_Allocate_Atom(c); +} /*-------------------------------------------------------------------------* - * CREATE_ATOM * + * UTF8_HACK_CLASSIFY_CHAR * + * * + *-------------------------------------------------------------------------*/ +int +UTF8_Hack_Classify_Char(unsigned long wc) +{ + if (wc < 256) return char_type[wc]; + if (iswupper(wc)) return CL; + if (iswlower(wc)) return SL; + if (iswpunct(wc)) return GR; + if (iswcntrl(wc) || iswspace(wc)) return LA; + return EX; +} +#endif + + + + +/*-------------------------------------------------------------------------* + * CREATE_ATOM_TAGGED * * * * Called by compiled prolog code. * *-------------------------------------------------------------------------*/ diff -ru gprolog-1.2.16/src/EnginePl/atom.h gprolog-1.2.16.patch/src/EnginePl/atom.h --- gprolog-1.2.16/src/EnginePl/atom.h 2002-03-19 20:24:36.000000000 +0200 +++ gprolog-1.2.16.patch/src/EnginePl/atom.h 2005-06-02 00:56:16.390371720 +0300 @@ -68,7 +68,11 @@ +#ifndef USE_UTF8_HACK #define Is_Valid_Code(c) ((unsigned) (c)-1 <256-1) /* 1<= c <256 */ +#else +#define Is_Valid_Code(c) ((signed long)(c) > 0) /* UTF-8 is 31 bits */ +#endif #define Is_Valid_Byte(c) ((unsigned) (c) <256) /* 0=< c <256 */ #define Is_Valid_Atom(a) ((a)>=0 && (a)