bug-gnulib
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

new module striconveha


From: Bruno Haible
Subject: new module striconveha
Date: Sun, 21 Jan 2007 23:59:31 +0100 (MET)
User-agent: KMail/1.5.4

The next iconv related module adds support for autodetection of the source
encoding.

2007-01-21  Bruno Haible  <address@hidden>

        * modules/striconveha: New file.
        * lib/striconveha.h: New file.
        * lib/striconveha.c: New file.
        * MODULES.html.sh (Internationalization functions): Add striconveha.
        * lib/striconv.c (str_iconv): Optimize the case of an empty input
        string.
        * lib/striconveh.c (mem_iconveh, str_iconveh): Likewise.

============================== modules/striconveha 
==============================
Description:
Character set conversion of strings with error handling and autodetection,
uses iconv.

Files:
lib/striconveha.h
lib/striconveha.c

Depends-on:
striconveh

configure.ac:

Makefile.am:
lib_SOURCES += striconveha.h striconveha.c

Include:
"striconveha.h"

License:
LGPL

Maintainer:
Bruno Haible

=============================== lib/striconveha.h 
===============================
/* Character set conversion with error handling and autodetection.
   Copyright (C) 2002, 2005, 2007 Free Software Foundation, Inc.
   Written by Bruno Haible.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software Foundation,
   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */

#ifndef _STRICONVEHA_H
#define _STRICONVEHA_H

#include "striconveh.h"


#ifdef __cplusplus
extern "C" {
#endif


/* Convert an entire string from one encoding to another, using iconv.
   The original string is at [SRC,...,SRC+SRCLEN-1].
   The "from" encoding can also be a name defined for autodetection.
   *RESULTP and *LENGTHP should initially be a scratch buffer and its size,
   or *RESULTP can initially be NULL.
   May erase the contents of the memory at *RESULTP.
   Return value: 0 if successful, otherwise -1 and errno set.
   If successful: The resulting string is stored in *RESULTP and its length
   in *LENGTHP.  *RESULTP is set to a freshly allocated memory block, or is
   unchanged if no dynamic memory allocation was necessary.  */
extern int
       mem_iconveha (const char *src, size_t srclen,
                     const char *from_codeset, const char *to_codeset,
                     enum iconv_ilseq_handler handler,
                     char **resultp, size_t *lengthp);

/* Convert an entire string from one encoding to another, using iconv.
   The original string is the NUL-terminated string starting at SRC.
   Both the "from" and the "to" encoding must use a single NUL byte at the
   end of the string (i.e. not UCS-2, UCS-4, UTF-16, UTF-32).
   The "from" encoding can also be a name defined for autodetection; in that
   case the encodings registered under that name are tried in order.
   Allocate a malloced memory block for the result.
   Return value: the freshly allocated resulting NUL-terminated string if
   successful, otherwise NULL and errno set.  */
extern char *
       str_iconveha (const char *src,
                     const char *from_codeset, const char *to_codeset,
                     enum iconv_ilseq_handler handler);


/* In the above, FROM_CODESET can also be one of the following values:
      "autodetect_utf8"         supports ISO-8859-1 and UTF-8
      "autodetect_jp"           supports EUC-JP, ISO-2022-JP-2 and SHIFT_JIS
      "autodetect_kr"           supports EUC-KR and ISO-2022-KR
   More names can be defined for autodetection.  */

/* Registers an encoding name for autodetection.
   NAME is the alias that can afterwards be passed as FROM_CODESET.
   TRY_IN_ORDER is a NULL terminated, non-empty list of encodings to be
   tried.  Both NAME and TRY_IN_ORDER are copied, so the caller may release
   or reuse them after the call.
   Returns 0 upon success, or -1 (with errno set) in case of error.
   Particular errno values: EINVAL (TRY_IN_ORDER is empty), ENOMEM.  */
extern int
       iconv_register_autodetect (const char *name,
                                  const char * const *try_in_order);


#ifdef __cplusplus
}
#endif


#endif /* _STRICONVEHA_H */
=============================== lib/striconveha.c 
===============================
/* Character set conversion with error handling and autodetection.
   Copyright (C) 2002, 2005, 2007 Free Software Foundation, Inc.
   Written by Bruno Haible.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software Foundation,
   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */

#include <config.h>

/* Specification.  */
#include "striconveha.h"

#include <errno.h>
#include <stdlib.h>
#include <string.h>

/* Number of elements of the array A.  A must be a true array, not a pointer.
   The argument is parenthesized so that any array-valued expression may be
   passed safely.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))


/* Autodetection list.  */

/* One entry of the singly linked list of autodetection aliases.  */
struct autodetect_alias
{
  /* Next entry in the list, or NULL for the last entry.  */
  struct autodetect_alias *next;
  /* The alias name, compared against FROM_CODESET with strcmp.  */
  const char *name;
  /* NULL-terminated list of encoding names, tried from first to last.  */
  const char * const *encodings_to_try;
};

/* Encodings tried, in this order, for the "autodetect_utf8" alias.  */
static const char * const autodetect_utf8_try[] =
{
  /* Try UTF-8 first. There are very few ISO-8859-1 inputs that would
     be valid UTF-8, but many UTF-8 inputs are valid ISO-8859-1.  */
  "UTF-8", "ISO-8859-1",
  NULL
};
/* Encodings tried, in this order, for the "autodetect_jp" alias.  */
static const char * const autodetect_jp_try[] =
{
  /* Try 7-bit encoding first. If the input contains bytes >= 0x80,
     it will fail.
     Try EUC-JP next. Short SHIFT_JIS inputs may come out wrong. This
     is unavoidable. People will condemn SHIFT_JIS.
     If we tried SHIFT_JIS first, then some short EUC-JP inputs would
     come out wrong, and people would condemn EUC-JP and Unix, which
     would not be good.
     Finally try SHIFT_JIS.  */
  "ISO-2022-JP-2", "EUC-JP", "SHIFT_JIS",
  NULL
};
/* Encodings tried, in this order, for the "autodetect_kr" alias.  */
static const char * const autodetect_kr_try[] =
{
  /* Try 7-bit encoding first. If the input contains bytes >= 0x80,
     it will fail.
     Finally try EUC-KR.  */
  "ISO-2022-KR", "EUC-KR",
  NULL
};

/* The built-in aliases, pre-linked into a list: each element's 'next' field
   points to the following array element, and the last one ends the chain
   with NULL.  Initializer order is { next, name, encodings_to_try }.  */
static struct autodetect_alias autodetect_predefined[] =
{
  { &autodetect_predefined[1], "autodetect_utf8", autodetect_utf8_try },
  { &autodetect_predefined[2], "autodetect_jp",   autodetect_jp_try },
  { NULL,                      "autodetect_kr",   autodetect_kr_try }
};

/* Head of the alias list; initially the first predefined alias.  */
static struct autodetect_alias *autodetect_list = &autodetect_predefined[0];
/* Address of the 'next' field of the last list element, so that a new
   alias can be appended by storing through this pointer.  */
static struct autodetect_alias **autodetect_list_end =
  &autodetect_predefined[SIZEOF(autodetect_predefined)-1].next;

/* Registers NAME as an autodetection alias for the encodings listed in
   TRY_IN_ORDER (a NULL-terminated list, tried from first to last).
   The name of this function matches its declaration in striconveha.h.
   NAME and all strings of TRY_IN_ORDER are deep-copied into a single
   malloc'ed block that is appended to the alias list and never freed here.
   Returns 0 upon success, or -1 with errno set:
     EINVAL  if TRY_IN_ORDER is empty,
     ENOMEM  if the memory allocation failed.  */
int
iconv_register_autodetect (const char *name,
                           const char * const *try_in_order)
{
  size_t namelen;
  size_t listlen;
  size_t memneed;
  size_t i;
  char *memory;
  struct autodetect_alias *new_alias;
  char *new_name;
  const char **new_try_in_order;

  /* The TRY_IN_ORDER list must not be empty.  */
  if (try_in_order[0] == NULL)
    {
      errno = EINVAL;
      return -1;
    }

  /* We must deep-copy NAME and TRY_IN_ORDER, because they may be allocated
     with dynamic extent.
     Memory layout of the single block:
       struct autodetect_alias
       (listlen + 1) pointers     -- the copied list, NULL-terminated
       the copy of NAME
       the copies of the encoding names.  */
  namelen = strlen (name) + 1;
  /* The extra sizeof (char *) accounts for the terminating NULL pointer.  */
  memneed = sizeof (struct autodetect_alias) + namelen + sizeof (char *);
  for (i = 0; try_in_order[i] != NULL; i++)
    memneed += sizeof (char *) + strlen (try_in_order[i]) + 1;
  listlen = i;

  memory = (char *) malloc (memneed);
  if (memory == NULL)
    {
      errno = ENOMEM;
      return -1;
    }

  new_alias = (struct autodetect_alias *) memory;
  memory += sizeof (struct autodetect_alias);

  /* The pointer array directly follows the struct, before any character
     data.  */
  new_try_in_order = (const char **) memory;
  memory += (listlen + 1) * sizeof (char *);

  new_name = (char *) memory;
  memcpy (new_name, name, namelen);
  memory += namelen;

  for (i = 0; i < listlen; i++)
    {
      size_t len = strlen (try_in_order[i]) + 1;
      memcpy (memory, try_in_order[i], len);
      new_try_in_order[i] = (const char *) memory;
      memory += len;
    }
  new_try_in_order[i] = NULL;

  /* Now append the new alias at the end of the list.  */
  new_alias->name = new_name;
  new_alias->encodings_to_try = new_try_in_order;
  new_alias->next = NULL;
  /* FIXME: Not multithread-safe.  */
  *autodetect_list_end = new_alias;
  autodetect_list_end = &new_alias->next;
  return 0;
}

/* Convert the SRCLEN bytes at SRC from FROM_CODESET to TO_CODESET, storing
   the result through RESULTP and LENGTHP as mem_iconveh does.
   FROM_CODESET is first tried as a real encoding name.  Only if that fails
   with EINVAL is it looked up in the list of autodetection aliases; the
   encodings of a matching alias are then tried in order until one does not
   fail with EILSEQ.
   Returns 0 if successful, otherwise -1 with errno set.  errno is EINVAL if
   FROM_CODESET is neither a supported encoding nor a registered alias.  */
int
mem_iconveha (const char *src, size_t srclen,
              const char *from_codeset, const char *to_codeset,
              enum iconv_ilseq_handler handler,
              char **resultp, size_t *lengthp)
{
  int retval = mem_iconveh (src, srclen, from_codeset, to_codeset, handler,
                            resultp, lengthp);
  if (retval >= 0 || errno != EINVAL)
    return retval;
  else
    {
      struct autodetect_alias *alias;

      /* Unsupported from_codeset or to_codeset. Check whether the caller
         requested autodetection.  */
      for (alias = autodetect_list; alias != NULL; alias = alias->next)
        if (strcmp (from_codeset, alias->name) == 0)
          {
            const char * const *encodings = alias->encodings_to_try;

            do
              {
                /* Retry with the candidate encoding, not with the alias
                   name itself: passing the alias again would re-enter this
                   very branch and recurse without end.  */
                retval = mem_iconveha (src, srclen,
                                       *encodings, to_codeset, handler,
                                       resultp, lengthp);
                /* Stop on success or on any failure other than EILSEQ.  */
                if (!(retval < 0 && errno == EILSEQ))
                  return retval;
                encodings++;
              }
            while (*encodings != NULL);

            /* Return the last call's result.  */
            return -1;
          }

      /* It wasn't an autodetection name.  */
      errno = EINVAL;
      return -1;
    }
}

/* Convert the NUL-terminated string SRC from FROM_CODESET to TO_CODESET.
   FROM_CODESET is first tried as a real encoding name.  Only if that fails
   with EINVAL is it looked up among the autodetection aliases; the encodings
   of a matching alias are then tried in order until one does not fail with
   EILSEQ.
   Returns the result of the deciding str_iconveh/str_iconveha call, or NULL
   with errno set.  errno is EINVAL if FROM_CODESET is neither a supported
   encoding nor a registered alias.  */
char *
str_iconveha (const char *src,
              const char *from_codeset, const char *to_codeset,
              enum iconv_ilseq_handler handler)
{
  struct autodetect_alias *entry;
  const char * const *candidate;
  char *converted;

  /* Direct attempt with FROM_CODESET taken literally.  */
  converted = str_iconveh (src, from_codeset, to_codeset, handler);
  if (converted != NULL)
    return converted;
  if (errno != EINVAL)
    return NULL;

  /* Unsupported from_codeset or to_codeset.  Look for an autodetection
     alias of that name.  */
  entry = autodetect_list;
  while (entry != NULL && strcmp (from_codeset, entry->name) != 0)
    entry = entry->next;

  if (entry == NULL)
    {
      /* It wasn't an autodetection name.  */
      errno = EINVAL;
      return NULL;
    }

  /* Try each candidate encoding in turn.  The first candidate is always
     attempted; the NULL terminator is only checked afterwards.  */
  candidate = entry->encodings_to_try;
  for (;;)
    {
      converted = str_iconveha (src, *candidate, to_codeset, handler);
      /* Anything but an EILSEQ failure settles the outcome.  */
      if (converted != NULL || errno != EILSEQ)
        return converted;
      candidate++;
      if (*candidate == NULL)
        break;
    }

  /* All candidates failed with EILSEQ; errno is left from the last call.  */
  return NULL;
}
=================================================================================





reply via email to

[Prev in Thread] Current Thread [Next in Thread]