bug-gnulib
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: proper names


From: Bruno Haible
Subject: Re: proper names
Date: Sun, 18 May 2008 15:24:22 +0200
User-agent: KMail/1.5.4

On 2006-09-07, I replied to Paul Eggert:
> > Second, this code does not look right:
> > 
> > >       /* See whether the translation contains the original name.  */
> > >       if (strstr (translation, name) != NULL)
> > 
> > Some translations might contain the name accidentally.
> > As an extreme case, the proper name "Z" (see
> > <http://itre.cis.upenn.edu/~myl/languagelog/archives/002480.html>)
> > might have a translation that contains the letter "Z".
> 
> OK, I'll make the test stricter, testing whether the occurrence of
> trim (name) inside the translation starts and ends at word boundaries.

Implemented as follows:

2008-05-18  Bruno Haible  <address@hidden>

        * lib/propername.c: Include <stdbool.h>, <ctype.h>, trim.h, mbchar.h,
        mbuiter.h. Don't include c-strstr.h.
        (mbsstr_trimmed_wordbounded): New function.
        (proper_name, proper_name_utf8): Use it instead of mbsstr or c_strstr.
        * modules/propername (Depends-on): Add stdbool, trim, mbchar, mbuiter.
        Remove c-strstr.
        Reported by Paul Eggert <address@hidden>.

*** lib/propername.c    7 Oct 2007 19:35:37 -0000       1.9
--- lib/propername.c    18 May 2008 13:20:13 -0000
***************
*** 1,5 ****
  /* Localization of proper names.
!    Copyright (C) 2006-2007 Free Software Foundation, Inc.
     Written by Bruno Haible <address@hidden>, 2006.
  
     This program is free software: you can redistribute it and/or modify
--- 1,5 ----
  /* Localization of proper names.
!    Copyright (C) 2006-2008 Free Software Foundation, Inc.
     Written by Bruno Haible <address@hidden>, 2006.
  
     This program is free software: you can redistribute it and/or modify
***************
*** 20,25 ****
--- 20,27 ----
  /* Specification.  */
  #include "propername.h"
  
+ #include <ctype.h>
+ #include <stdbool.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
***************
*** 27,40 ****
  # include <iconv.h>
  #endif
  
  #include "localcharset.h"
  #include "c-strcase.h"
  #include "xstriconv.h"
- #include "c-strstr.h"
  #include "xalloc.h"
  #include "gettext.h"
  
  
  /* Return the localization of NAME.  NAME is written in ASCII.  */
  
  const char *
--- 29,155 ----
  # include <iconv.h>
  #endif
  
+ #include "trim.h"
+ #include "mbchar.h"
+ #if HAVE_MBRTOWC
+ # include "mbuiter.h"
+ #endif
  #include "localcharset.h"
  #include "c-strcase.h"
  #include "xstriconv.h"
  #include "xalloc.h"
  #include "gettext.h"
  
  
+ /* Tests whether STRING contains trim (SUB), starting and ending at word
+    boundaries.
+    Here, instead of implementing Unicode Standard Annex #29 for determining
+    word boundaries, we assume that trim (SUB) starts and ends with words and
+    only test whether the part before it ends with a non-word and the part
+    after it starts with a non-word.  */
+ static bool
+ mbsstr_trimmed_wordbounded (const char *string, const char *sub)
+ {
+   char *tsub = trim (sub);
+   bool found = false;
+ 
+   for (; *string != '\0';)
+     {
+       const char *tsub_in_string = mbsstr (string, tsub);
+       if (tsub_in_string == NULL)
+       break;
+       else
+       {
+ #if HAVE_MBRTOWC
+         if (MB_CUR_MAX > 1)
+           {
+             mbui_iterator_t string_iter;
+             bool word_boundary_before;
+             bool word_boundary_after;
+ 
+             mbui_init (string_iter, string);
+             word_boundary_before = true;
+             if (mbui_cur_ptr (string_iter) < tsub_in_string)
+               {
+                 mbchar_t last_char_before_tsub;
+                 do
+                   {
+                     if (!mbui_avail (string_iter))
+                       abort ();
+                     last_char_before_tsub = mbui_cur (string_iter);
+                     mbui_advance (string_iter);
+                   }
+                 while (mbui_cur_ptr (string_iter) < tsub_in_string);
+                 if (mb_isalnum (last_char_before_tsub))
+                   word_boundary_before = false;
+               }
+ 
+             mbui_init (string_iter, tsub_in_string);
+             {
+               mbui_iterator_t tsub_iter;
+ 
+               for (mbui_init (tsub_iter, tsub);
+                    mbui_avail (tsub_iter);
+                    mbui_advance (tsub_iter))
+                 {
+                   if (!mbui_avail (string_iter))
+                     abort ();
+                   mbui_advance (string_iter);
+                 }
+             }
+             word_boundary_after = true;
+             if (mbui_avail (string_iter))
+               {
+                 mbchar_t first_char_after_tsub = mbui_cur (string_iter);
+                 if (mb_isalnum (first_char_after_tsub))
+                   word_boundary_after = false;
+               }
+ 
+             if (word_boundary_before && word_boundary_after)
+               {
+                 found = true;
+                 break;
+               }
+ 
+             mbui_init (string_iter, tsub_in_string);
+             if (!mbui_avail (string_iter))
+               break;
+             string = tsub_in_string + mb_len (mbui_cur (string_iter));
+           }
+         else
+ #endif /* HAVE_MBRTOWC */
+           {
+             bool word_boundary_before;
+             const char *p;
+             bool word_boundary_after;
+ 
+             word_boundary_before = true;
+             if (string < tsub_in_string)
+               if (isalnum ((unsigned char) tsub_in_string[-1]))
+                 word_boundary_before = false;
+ 
+             p = tsub_in_string + strlen (tsub);
+             word_boundary_after = true;
+             if (*p != '\0')
+               if (isalnum ((unsigned char) *p))
+                 word_boundary_after = false;
+ 
+             if (word_boundary_before && word_boundary_after)
+               {
+                 found = true;
+                 break;
+               }
+ 
+             if (*tsub_in_string == '\0')
+               break;
+             string = tsub_in_string + 1;
+           }
+       }
+     }
+   free (tsub);
+   return found;
+ }
+ 
  /* Return the localization of NAME.  NAME is written in ASCII.  */
  
  const char *
***************
*** 46,52 ****
    if (translation != name)
      {
        /* See whether the translation contains the original name.  */
!       if (mbsstr (translation, name) != NULL)
        return translation;
        else
        {
--- 161,167 ----
    if (translation != name)
      {
        /* See whether the translation contains the original name.  */
!       if (mbsstr_trimmed_wordbounded (translation, name))
        return translation;
        else
        {
***************
*** 116,128 ****
  
    if (translation != name_ascii)
      {
!       /* See whether the translation contains the original name.
!        The multibyte-aware mbsstr() is not absolutely necessary here.  */
!       if (c_strstr (translation, name_ascii) != NULL
          || (name_converted != NULL
!             && mbsstr (translation, name_converted) != NULL)
          || (name_converted_translit != NULL
!             && mbsstr (translation, name_converted_translit) != NULL))
        {
          if (alloc_name_converted != NULL)
            free (alloc_name_converted);
--- 231,242 ----
  
    if (translation != name_ascii)
      {
!       /* See whether the translation contains the original name.  */
!       if (mbsstr_trimmed_wordbounded (translation, name_ascii)
          || (name_converted != NULL
!             && mbsstr_trimmed_wordbounded (translation, name_converted))
          || (name_converted_translit != NULL
!             && mbsstr_trimmed_wordbounded (translation, name_converted_translit)))
        {
          if (alloc_name_converted != NULL)
            free (alloc_name_converted);
***************
*** 155,157 ****
--- 269,283 ----
        return name;
      }
  }
+ 
+ #ifdef TEST
+ # include <locale.h>
+ int
+ main (int argc, char *argv[])
+ {
+   setlocale (LC_ALL, "");
+   if (mbsstr_trimmed_wordbounded (argv[1], argv[2]))
+     printf("found\n");
+   return 0;
+ }
+ #endif
*** modules/propername  18 May 2008 11:39:57 -0000      1.4
--- modules/propername  18 May 2008 13:20:13 -0000
***************
*** 12,23 ****
  lib/propername.c
  
  Depends-on:
  iconv
  localcharset
  c-strcase
  xstriconv
- c-strstr
- mbsstr
  xalloc
  gettext-h
  
--- 12,26 ----
  lib/propername.c
  
  Depends-on:
+ stdbool
+ trim
+ mbsstr
+ mbchar
+ mbuiter
  iconv
  localcharset
  c-strcase
  xstriconv
  xalloc
  gettext-h
  





reply via email to

[Prev in Thread] Current Thread [Next in Thread]