[disorder] / lib / charset.c

/*
 * This file is part of DisOrder.
 * Copyright (C) 2004, 2005 Richard Kettlewell
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 */
/** @file lib/charset.c @brief Character set conversion */

#include <config.h>
#include "types.h"

#include <iconv.h>
#include <string.h>
#include <errno.h>
#include <langinfo.h>

#include "mem.h"
#include "log.h"
#include "charset.h"
#include "configuration.h"
#include "utf8.h"
#include "vector.h"

/** @brief Low-level conversion routine
 * @param from Source encoding
 * @param to Destination encoding
 * @param ptr First byte to convert
 * @param n Number of bytes to convert
 * @return Converted text, or NULL on error.
 *
 * Exactly @p n input bytes are converted, so the result is only 0-terminated
 * if the input range includes a terminator.
 *
 * Failure to open the conversion descriptor is reported via fatal(); a
 * conversion failure is reported via error() and yields NULL.
 */
static void *convert(const char *from, const char *to,
		     const void *ptr, size_t n) {
  iconv_t i;
  size_t len;
  char *buf = 0, *s, *d;
  size_t bufsize = 0, sl, dl;
  int e = 0;

  if((i = iconv_open(to, from)) == (iconv_t)-1)
    fatal(errno, "error calling iconv_open");
  do {
    /* Start at 32 bytes and double each time the output did not fit */
    bufsize = bufsize ? 2 * bufsize : 32;
    buf = xrealloc_noptr(buf, bufsize);
    /* Reset the conversion state; every attempt restarts from the beginning
     * of the input */
    iconv(i, 0, 0, 0, 0);
    s = (char *)ptr;
    sl = n;
    d = buf;
    dl = bufsize;
    /* (void *) to work around FreeBSD's nonstandard iconv prototype */
    len = iconv(i, (void *)&s, &sl, &d, &dl);
    /* Capture errno immediately: iconv_close() below may overwrite it before
     * the failure is reported */
    if(len == (size_t)-1)
      e = errno;
  } while(len == (size_t)-1 && e == E2BIG);
  iconv_close(i);
  if(len == (size_t)-1) {
    error(e, "error converting from %s to %s", from, to);
    return 0;
  }
  return buf;
}

/** @brief Convert UTF-8 to UCS-4
 * @param mb Pointer to 0-terminated UTF-8 string
 * @return Pointer to 0-terminated UCS-4 string, or NULL if an invalid
 * sequence is found (in which case an error is logged)
 *
 * Not everybody's iconv supports UCS-4, and it's inconvenient to have to know
 * our endianness, and it's easy to convert it ourselves, so we do.  See also
 * @ref ucs42utf8().
 */
uint32_t *utf82ucs4(const char *mb) {
  struct dynstr_ucs4 d;
  uint32_t c;

  dynstr_ucs4_init(&d);
  while(*mb) {
    /* PARSE_UTF8 is defined outside this file.  NOTE(review): presumably it
     * decodes one character at mb into c and advances mb past it - confirm
     * against its definition.  The third argument is the code to run when the
     * sequence is malformed: log the problem and give up, returning NULL. */
    PARSE_UTF8(mb, c,
	       error(0, "invalid UTF-8 sequence"); return 0;);
    dynstr_ucs4_append(&d, c);
  }
  /* Terminate the accumulated string and hand its storage to the caller */
  dynstr_ucs4_terminate(&d);
  return d.vec;
}

/** @brief Convert UCS-4 to UTF-8
 * @param u Pointer to 0-terminated UCS-4 string
 * @return Pointer to 0-terminated UTF-8 string, or NULL if @p u contains a
 * value that cannot be represented in UTF-8 (in which case an error is logged)
 *
 * Values of 0x110000 and above, and the surrogate range 0xD800-0xDFFF, are
 * rejected: neither has a valid UTF-8 encoding.
 *
 * See @ref utf82ucs4().
 */
char *ucs42utf8(const uint32_t *u) {
  struct dynstr d;
  uint32_t c;

  dynstr_init(&d);
  while((c = *u++)) {
    /* Reject anything outside the Unicode scalar value range up front, so the
     * encoding branches below only ever see encodable values */
    if(c >= 0x110000 || (c >= 0xD800 && c <= 0xDFFF)) {
      error(0, "invalid UCS-4 character");
      return 0;
    }
    if(c < 0x80)
      /* 1 byte: 0xxxxxxx */
      dynstr_append(&d, c);
    else if(c < 0x800) {
      /* 2 bytes: 110xxxxx 10xxxxxx */
      dynstr_append(&d, 0xC0 | (c >> 6));
      dynstr_append(&d, 0x80 | (c & 0x3F));
    } else if(c < 0x10000) {
      /* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx */
      dynstr_append(&d, 0xE0 | (c >> 12));
      dynstr_append(&d, 0x80 | ((c >> 6) & 0x3F));
      dynstr_append(&d, 0x80 | (c & 0x3F));
    } else {
      /* 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
      dynstr_append(&d, 0xF0 | (c >> 18));
      dynstr_append(&d, 0x80 | ((c >> 12) & 0x3F));
      dynstr_append(&d, 0x80 | ((c >> 6) & 0x3F));
      dynstr_append(&d, 0x80 | (c & 0x3F));
    }
  }
  dynstr_terminate(&d);
  return d.vec;
}

/** @brief Convert from the local multibyte encoding to UTF-8
 * @param mb 0-terminated string in the encoding named by nl_langinfo(CODESET)
 * @return Result of convert(): the converted text, or NULL on error
 */
char *mb2utf8(const char *mb) {
  /* Include the terminator in the converted range so the output carries one */
  const size_t nbytes = strlen(mb) + 1;
  const char *const local = nl_langinfo(CODESET);

  return convert(local, "UTF-8", mb, nbytes);
}

/** @brief Convert from UTF-8 to the local multibyte encoding
 * @param utf8 0-terminated UTF-8 string
 * @return Result of convert(): the converted text, or NULL on error
 *
 * The target encoding is whatever nl_langinfo(CODESET) reports.
 */
char *utf82mb(const char *utf8) {
  /* The terminator is converted along with the text */
  const size_t total = strlen(utf8) + 1;
  const char *const codeset = nl_langinfo(CODESET);

  return convert("UTF-8", codeset, utf8, total);
}

/** @brief Convert from encoding @p from to UTF-8
 * @param from Name of the source encoding
 * @param any 0-terminated string encoded in @p from
 * @return Result of convert(): the converted text, or NULL on error
 */
char *any2utf8(const char *from, const char *any) {
  /* Count the terminator too, so it is carried through the conversion */
  const size_t span = strlen(any) + 1;

  return convert(from, "UTF-8", any, span);
}

/** @brief Convert from encoding @p from to the local multibyte encoding
 * @param from Name of the source encoding, or NULL for no conversion
 * @param any 0-terminated string to convert
 * @return Converted text (NULL on conversion error), or a copy of @p any
 * when @p from is NULL
 */
char *any2mb(const char *from, const char *any) {
  /* No source encoding given: hand back a plain copy */
  if(!from)
    return xstrdup(any);
  return convert(from, nl_langinfo(CODESET), any, strlen(any) + 1);
}

/** @brief Convert from encoding @p from to encoding @p to
 * @param from Source encoding, or NULL for the local multibyte encoding
 * @param to Destination encoding, or NULL for the local multibyte encoding
 * @param any 0-terminated string to convert
 * @return Converted text (NULL on conversion error), or a copy of @p any
 * when both encodings are NULL
 *
 * A NULL encoding is taken to mean the local multibyte encoding, in line
 * with any2mb(), so a NULL pointer is never passed on to iconv_open().
 */
char *any2any(const char *from,
	      const char *to,
	      const char *any) {
  /* Local to local: nothing to convert */
  if(!from && !to)
    return xstrdup(any);
  /* At most one side is NULL here, so nl_langinfo() is called at most once
   * and its result cannot be overwritten by a second call */
  if(!from)
    from = nl_langinfo(CODESET);
  else if(!to)
    to = nl_langinfo(CODESET);
  return convert(from, to, any, strlen(any) + 1);
}

/** @brief Compare two 0-terminated UCS-4 strings
 * @param a First string
 * @param b Second string
 * @return 0 if the strings are equal; otherwise 1 or -1 according to whether
 * the first differing element of @p a is greater or less than that of @p b
 *
 * Elements are compared as unsigned 32-bit values, and a string that is a
 * proper prefix of the other compares less.
 *
 * We don't rely on the local @c wchar_t being UCS-4.
 */
int ucs4cmp(const uint32_t *a, const uint32_t *b) {
  /* Skip the common prefix.  If *a is nonzero and equal to *b then *b is
   * nonzero too, so only one termination check is needed. */
  for(; *a != 0 && *a == *b; ++a, ++b)
    ;
  /* Yields 1, -1 or 0 */
  return (*a > *b) - (*a < *b);
}

/*
Local Variables:
c-basic-offset:2
comment-column:40
End:
*/
Commit	Line	Data
460b9539	1	/*
	2	* This file is part of DisOrder.
	3	* Copyright (C) 2004, 2005 Richard Kettlewell
	4	*
	5	* This program is free software; you can redistribute it and/or modify
	6	* it under the terms of the GNU General Public License as published by
	7	* the Free Software Foundation; either version 2 of the License, or
	8	* (at your option) any later version.
	9	*
	10	* This program is distributed in the hope that it will be useful, but
	11	* WITHOUT ANY WARRANTY; without even the implied warranty of
	12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	13	* General Public License for more details.
	14	*
	15	* You should have received a copy of the GNU General Public License
	16	* along with this program; if not, write to the Free Software
	17	* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
	18	* USA
	19	*/
14ad73b9	20	/** @file lib/charset.c @brief Character set conversion */
460b9539	21
	22	#include <config.h>
	23	#include "types.h"
	24
	25	#include <iconv.h>
	26	#include <string.h>
	27	#include <errno.h>
	28	#include <langinfo.h>
	29
	30	#include "mem.h"
	31	#include "log.h"
	32	#include "charset.h"
	33	#include "configuration.h"
	34	#include "utf8.h"
	35	#include "vector.h"
	36
14ad73b9 RK	37	/** @brief Low-level converstion routine
	38	* @param from Source encoding
	39	* @param to Destination encoding
	40	* @param ptr First byte to convert
	41	* @param n Number of bytes to convert
	42	* @return Converted text, 0-terminated; or NULL on error.
	43	*/
460b9539	44	static void convert(const char from, const char *to,
	45	const void *ptr, size_t n) {
	46	iconv_t i;
	47	size_t len;
	48	char buf = 0, s, *d;
	49	size_t bufsize = 0, sl, dl;
	50
	51	if((i = iconv_open(to, from)) == (iconv_t)-1)
	52	fatal(errno, "error calling iconv_open");
	53	do {
	54	bufsize = bufsize ? 2 * bufsize : 32;
	55	buf = xrealloc_noptr(buf, bufsize);
	56	iconv(i, 0, 0, 0, 0);
	57	s = (char *)ptr;
	58	sl = n;
	59	d = buf;
	60	dl = bufsize;
	61	/* (void ) to work around FreeBSD's nonstandard iconv prototype /
	62	len = iconv(i, (void *)&s, &sl, &d, &dl);
	63	} while(len == (size_t)-1 && errno == E2BIG);
	64	iconv_close(i);
	65	if(len == (size_t)-1) {
	66	error(errno, "error converting from %s to %s", from, to);
	67	return 0;
	68	}
	69	return buf;
	70	}
	71
14ad73b9 RK	72	/** @brief Convert UTF-8 to UCS-4
	73	* @param mb Pointer to 0-terminated UTF-8 string
	74	* @return Pointer to 0-terminated UCS-4 string
	75	*
	76	* Not everybody's iconv supports UCS-4, and it's inconvenient to have to know
	77	* our endianness, and it's easy to convert it ourselves, so we do. See also
	78	* @ref ucs42utf8().
	79	*/
460b9539	80	uint32_t utf82ucs4(const char mb) {
	81	struct dynstr_ucs4 d;
	82	uint32_t c;
	83
	84	dynstr_ucs4_init(&d);
	85	while(*mb) {
	86	PARSE_UTF8(mb, c,
	87	error(0, "invalid UTF-8 sequence"); return 0;);
	88	dynstr_ucs4_append(&d, c);
	89	}
	90	dynstr_ucs4_terminate(&d);
	91	return d.vec;
	92	}
	93
14ad73b9 RK	94	/** @brief Convert UCS-4 to UTF-8
	95	* @param u Pointer to 0-terminated UCS-4 string
	96	* @return Pointer to 0-terminated UTF-8 string
	97	*
	98	* See @ref utf82ucs4().
	99	*/
460b9539	100	char ucs42utf8(const uint32_t u) {
	101	struct dynstr d;
	102	uint32_t c;
	103
	104	dynstr_init(&d);
	105	while((c = *u++)) {
	106	if(c < 0x80)
	107	dynstr_append(&d, c);
	108	else if(c < 0x800) {
	109	dynstr_append(&d, 0xC0 \| (c >> 6));
	110	dynstr_append(&d, 0x80 \| (c & 0x3F));
	111	} else if(c < 0x10000) {
	112	dynstr_append(&d, 0xE0 \| (c >> 12));
	113	dynstr_append(&d, 0x80 \| ((c >> 6) & 0x3F));
	114	dynstr_append(&d, 0x80 \| (c & 0x3F));
	115	} else if(c < 0x110000) {
	116	dynstr_append(&d, 0xF0 \| (c >> 18));
	117	dynstr_append(&d, 0x80 \| ((c >> 12) & 0x3F));
	118	dynstr_append(&d, 0x80 \| ((c >> 6) & 0x3F));
	119	dynstr_append(&d, 0x80 \| (c & 0x3F));
	120	} else {
	121	error(0, "invalid UCS-4 character");
	122	return 0;
	123	}
	124	}
	125	dynstr_terminate(&d);
	126	return d.vec;
	127	}
	128
14ad73b9	129	/** @brief Convert from the local multibyte encoding to UTF-8 */
460b9539	130	char mb2utf8(const char mb) {
	131	return convert(nl_langinfo(CODESET), "UTF-8", mb, strlen(mb) + 1);
	132	}
	133
14ad73b9	134	/** @brief Convert from UTF-8 to the local multibyte encoding */
460b9539	135	char utf82mb(const char utf8) {
	136	return convert("UTF-8", nl_langinfo(CODESET), utf8, strlen(utf8) + 1);
	137	}
	138
14ad73b9	139	/** @brief Convert from encoding @p from to UTF-8 */
460b9539	140	char any2utf8(const char from, const char *any) {
	141	return convert(from, "UTF-8", any, strlen(any) + 1);
	142	}
	143
14ad73b9	144	/** @brief Convert from encoding @p from to the local multibyte encoding */
460b9539	145	char any2mb(const char from, const char *any) {
	146	if(from) return convert(from, nl_langinfo(CODESET), any, strlen(any) + 1);
	147	else return xstrdup(any);
	148	}
	149
14ad73b9	150	/** @brief Convert from encoding @p from to encoding @p to */
460b9539	151	char any2any(const char from,
	152	const char *to,
	153	const char *any) {
	154	if(from \|\| to) return convert(from, to, any, strlen(any) + 1);
	155	else return xstrdup(any);
	156	}
	157
14ad73b9 RK	158	/** @brief strlen workalike for UCS-4 strings
	159	*
	160	* We don't rely on the local @c wchar_t being UCS-4.
	161	*/
460b9539	162	int ucs4cmp(const uint32_t a, const uint32_t b) {
	163	while(a && b && a == b) ++a, ++b;
	164	if(a > b) return 1;
	165	else if(a < b) return -1;
	166	else return 0;
	167	}
	168
	169	/*
	170	Local Variables:
	171	c-basic-offset:2
	172	comment-column:40
	173	End:
	174	*/