[JDEV] Question on how to encode unicode into utf8 for jabber

Dirk-Willem van Gulik dirkx at covalent.net
Thu Oct 18 13:20:18 CDT 2001



On Thu, 18 Oct 2001, David Rainville wrote:

> Hi Everyone,

>     I tried to encode the Unicode character set to fit in the utf8 character
> set. I encoded every character in this structure: \uxxxx, where x are
> hexadecimal digits. It works on my client because I decode it in a way that
> it converts the \uxxxx to the unicode character. Is it the way to do it?
> Will every other client have this as a standard?

No. UTF does not quite work that way. You may want to get yourself a copy
of the Unicode Standard (www.unicode.com or amazon.com :-). See attached
little routines.

Dw
-------------- next part --------------
/*
 * ====================================================================
 * Copyright (c) 1999 Dirk-Willem van Gulik - WebWeaving m/v
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 * 
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 * 
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 * 
 * 3. All advertising materials mentioning features or use of this software must
 * display the following acknowledgment: "This product includes software
 * developed by WebWeaving Consulancy (http://www.webweaving.org).
 * 
 * 4. The name "WebWeaving", must not be used to endorse or promote products
 * derived from this software without prior written permission. For written
 * permission, please contact dirkx at webweaving.org.
 * 
 * 5. Redistributions of any form whatsoever must retain the following
 * acknowledgment: "This product includes software developed by WebWeaving
 * for use in the Apache HTTP server project (http://www.apache.org/)."
 * 
 * THIS SOFTWARE IS PROVIDED BY WEBWEAVING AND AFFILIATES ``AS IS'' AND ANY
 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL WEBWEAVING OR ITS AFFILIATES OR ITS
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * ====================================================================
 * 
 * Simple UTF8 / Unicode / LatinX conversion utilities. Note that these are
 * incomplete and ONLY do the first few tens of codepages. The higher ones
 * (i.e. the full 32 bits) are left as an exercise to the reader.. and have
 * not been defined anyway at the time of writing this...
 * 
 * Version
 *	0.00	Winter 1997	First version
 *	0.01	Fall 1991	minor speed ups.
 *	0.02	May 1995	Alpine; apache pool
 *	0.04	Jun 1996	Alpine again.
 */

#include <sys/param.h>		/* NULL, BSD or sV define */
#include <assert.h>		/* -NDEBUG for needless assert()s */
#include <ctype.h>		/* tolower */
#ifdef BSD
#include <string.h>
#else
#include <string.h>
#endif
#include <strings.h>		/* strcasecmp */
#include "char_map.h"
#include "char_util.h"

/*
 * Convert utf8 / latin strings.
 * 
 * pool		Memory pool (apache style)
 * latin	Integer 1 .. 8 for the latin tables
 * *string	String pointer to '\0' terminated octet string
 * **outp	Output pointer; semantics
 *		  NULL    No conversion; just return the len.
 *		  *NULL   Create a palloced space
 *		  !*NULL  Copy into provided space. Which must be big
 *		          enough :-) i.e. 3x for latin to utf8 and 1x for
 *		          utf8 to latin in the worst (but likely) case.
 * *len		When the function returns C3_OK len will contain the
 *		length (in octets) of the converted string. Upon entry
 *		if *len != 0 then the *outp will be limited to that
 *		length. (including '\0' terminator).
 */

/*
 * read a 8, 16 or 8,12,16 bit char into i, from the input thing (f)
 * depending on the settings of 't'.
 */
/*
 * GETOCTED(i, f): read one octet through pointer f into i and advance f.
 * A zero octet marks the end of the string and is reported as C3_EOF.
 *
 * The previous form used `*f++ || C3_EOF`, which in C is a logical OR and
 * therefore only ever produced 0 or 1 instead of the octet itself.
 * Wrapped in do { } while (0) so it behaves as a single statement.
 */
#define GETOCTED(i,f) do {                                      \
		(i) = *(f)++;					\
		if ((i) == 0)					\
			(i) = C3_EOF;				\
		} while (0)

/*
 * GETUNICODE(i, f): read a 16 bit character, high octet first, through
 * pointer f into i and advance f.
 *
 *   - a zero first octet is the end of the string  -> C3_EOF
 *   - a zero second octet is a cut-off character   -> C3_TRUNC
 *   - a value above MAX_CCS                        -> C3_MAX
 *
 * The previous form used `*f++ || C3_TRUNC`, a logical OR, so the low
 * octet always came out as 1.
 *
 * NOTE(review): because zero octets act as terminators, characters whose
 * high or low octet is 0x00 cannot be represented in this input format.
 */
#define GETUNICODE(i,f) do {                                    \
		register int c3_hi, c3_lo;			\
		if (!(c3_hi = *(f)++))				\
			(i) = C3_EOF;				\
		else if (!(c3_lo = *(f)++))			\
			(i) = C3_TRUNC;				\
		else						\
			(i) = (c3_hi << 8) + c3_lo;		\
		if ((i) > MAX_CCS)				\
			(i) = C3_MAX;				\
		} while (0)

/*
 * GETUTF8(i, f): decode one UTF-8 sequence of 1, 2 or 3 octets through
 * pointer f into i and advance f past the octets consumed.
 *
 *   - a zero lead octet is the end of the string        -> C3_EOF
 *   - a zero octet inside a multi-octet sequence        -> C3_TRUNC
 *   - a continuation octet not of the form 10xxxxxx, a
 *     stray continuation octet used as a lead, or a lead
 *     announcing more than 3 octets                     -> C3_ILLEGAL
 *     (the last case could arguably be C3_MAX instead)
 *   - a decoded value above MAX_CCS                     -> C3_MAX
 *
 * Fixes relative to the previous version:
 *   - the third octet was read with `*f++ || C3_TRUNC` (a logical OR), so
 *     it was always 1 and every 3-octet sequence came out as C3_ILLEGAL;
 *   - the 2-octet branch now validates its continuation octet, as the
 *     3-octet branch already did;
 *   - internal temporaries carry a c3_ prefix so they cannot capture the
 *     caller's argument names, and the body is a do { } while (0).
 */
#define GETUTF8(i,f) do {                                       \
		register int c3_b0, c3_b1, c3_b2, c3_top;	\
		c3_b0 = *(f)++;					\
		c3_top = c3_b0 >> 4;				\
		if (c3_b0 == 0)					\
			(i) = C3_EOF;				\
		else if (c3_top < 8)				\
			(i) = c3_b0;				\
		else if (c3_top == 12 || c3_top == 13) {	\
			c3_b1 = *(f)++;				\
			if (c3_b1 == 0)				\
				(i) = C3_TRUNC;			\
			else if ((c3_b1 & 0xC0) != 0x80)	\
				(i) = C3_ILLEGAL;		\
			else					\
				(i) = ((c3_b0 & 0x1F) << 6) |	\
				      (c3_b1 & 0x3F);		\
		} else if (c3_top == 14) {			\
			if (((c3_b1 = *(f)++) == 0) ||		\
			    ((c3_b2 = *(f)++) == 0))		\
				(i) = C3_TRUNC;			\
			else if (((c3_b1 & 0xC0) != 0x80) ||	\
				 ((c3_b2 & 0xC0) != 0x80))	\
				(i) = C3_ILLEGAL;		\
			else					\
				(i) = ((c3_b0 & 0x0F) << 12) |	\
				      ((c3_b1 & 0x3F) << 6) |	\
				      (c3_b2 & 0x3F);		\
		} else						\
			(i) = C3_ILLEGAL;			\
		if ((i) > MAX_CCS)				\
			(i) = C3_MAX;				\
	} while (0)

/*
 * note no MAX_CCS check
 */
/*
 * Convert a UTF-8 string using the conversion table selected by `latin`.
 *
 * latin	index into unicode_latin[]; must lie in 0 .. MAPS-1 and
 *		name a table that is actually present
 * string	'\0' terminated input
 * out, len	handed through unchanged to C3_map_latin()
 *
 * Returns C3_NO_CNV when no table is available for `latin`, otherwise
 * whatever C3_map_latin() returns.
 */
C3Error
C3_utf8_to_latin(
		 int latin,
		 const char *string,
		 char **out,
		 int *len
)
{
	/* dispatch only for an in-range index with a table behind it */
	if (latin >= 0 && latin < MAPS && unicode_latin[latin])
		return C3_map_latin(unicode_latin[latin], string, out, len);

	return C3_NO_CNV;
}

/*
 * Decode a UTF-8 string and translate every code point through `map`.
 *
 * map		table indexed by code point; each entry is either NULL
 *		(no mapping: the character is dropped silently) or a
 *		'\0' terminated octet sequence to emit for that code point
 * string	'\0' terminated UTF-8 input
 * outp		NULL: nothing is written, only the length is computed.
 *		Otherwise *outp must be a caller supplied buffer (asserted).
 * lenp		may be NULL. If *lenp is non zero on entry it is the size
 *		of the buffer including the '\0' terminator and the output
 *		is cut off to fit. On C3_OK it receives the output length:
 *		with a buffer this counts the terminator, in length-only
 *		mode it does not.
 *
 * Returns C3_OK, or the decoder error reported by GETUTF8 (in which case
 * *lenp is left untouched; a supplied buffer is still terminated).
 *
 * Fixes relative to the previous version:
 *   - the length-only loop never advanced its pointer and spun forever on
 *     the first mapped character;
 *   - the size-limited copy wrote whole mappings without checking the end
 *     of the buffer;
 *   - a buffer that filled up (even one of exactly the right size) made
 *     the function return the last code point as if it were a C3Error.
 *     Output cut off for lack of space now returns C3_OK, the same way
 *     C3_map_utf8() behaves.
 */
C3Error
C3_map_latin(
	     const unsigned char * *map,
	     const unsigned char *string,
	     char ** outp,
	     int *lenp
)
{
	register const unsigned char * f = string;
	register const unsigned char * p;
	register int i = (int) C3_EOF, len = 0;
	register char * out;
	int tmp = 0;

	if (!lenp)
		lenp = &tmp;

	if (outp) {
		assert(*outp);
		out = *outp;
	} else
		out = NULL;

	if (out) {
		char * begin = out;
		if (*lenp) {
			/* keep the last octet free for the terminator */
			register char * end = out + *lenp - 1;
			int n;

			while (out < end) {
				GETUTF8(i, f);
				if (i <= 0)
					break;
				p = map[i];
				if (!p)
					continue;
				/*
				 * Emit a mapping only when all of it fits,
				 * so the output never ends mid-character.
				 */
				for (n = 0; p[n]; n++)
					;
				if (n > end - out)
					break;
				while (*p)
					*out++ = *p++;
			}
		} else {
			while (1) {
				GETUTF8(i, f);
				if (i <= 0)
					break;
				if (map[i])
					for (p = map[i]; *p; p++)
						*out++ = *p;
			}
		}
		*out++ = '\0';
		len = out - begin;
	} else {
		len = 0;
		while (1) {
			GETUTF8(i, f);
			if (i <= 0)
				break;
			if (map[i])
				for (p = map[i]; *p; p++)
					len++;
		}
	}

	/*
	 * The loops above only stop early on i <= 0, so decoder errors are
	 * the negative values other than C3_EOF. A positive i here means the
	 * output buffer was full, which is not an error.
	 */
	if (i < 0 && i != C3_EOF)
		return (C3Error) i;

	*lenp = len;
	return C3_OK;
}

/*
 * Convert a string in the character set selected by `latin` to UTF-8.
 *
 * latin	index into latin_unicode[]; must lie in 0 .. MAPS-1 and
 *		name a table that is actually present
 * string	'\0' terminated input
 * out, len	handed through unchanged to C3_map_utf8()
 *
 * Returns C3_NO_CNV when no table is available for `latin`, otherwise
 * whatever C3_map_utf8() returns.
 */
C3Error
C3_latin_to_utf8(
		 int latin,
		 const char *string,
		 unsigned char **out,
		 int *len
)
{
	/* dispatch only for an in-range index with a table behind it */
	if (latin >= 0 && latin < MAPS && latin_unicode[latin])
		return C3_map_utf8(latin_unicode[latin], string, out, len);

	return C3_NO_CNV;
}

/* Number of octets the UTF-8 encoding of code point cp occupies (1..3). */
static int
c3_utf8_octets(int cp)
{
	if ((cp >= 0x0001) && (cp <= 0x007F))
		return 1;
	if (cp > 0x07FF)
		return 3;
	return 2;
}

/* Store the UTF-8 encoding of cp at out; returns the position after it. */
static unsigned char *
c3_utf8_put(unsigned char *out, int cp)
{
	if ((cp >= 0x0001) && (cp <= 0x007F)) {
		*out++ = (unsigned char) cp;
	} else if (cp > 0x07FF) {
		*out++ = (unsigned char) (0xE0 | ((cp >> 12) & 0x0F));
		*out++ = (unsigned char) (0x80 | ((cp >> 6) & 0x3F));
		*out++ = (unsigned char) (0x80 | ((cp >> 0) & 0x3F));
	} else {
		*out++ = (unsigned char) (0xC0 | ((cp >> 6) & 0x1F));
		*out++ = (unsigned char) (0x80 | ((cp >> 0) & 0x3F));
	}
	return out;
}

/*
 * Encode a '\0' terminated octet string as UTF-8.
 *
 * map		conversion table handed in by C3_latin_to_utf8()
 * string	'\0' terminated input, read octet by octet
 * outp		NULL: nothing is written, only the length is computed.
 *		Otherwise *outp must be a caller supplied buffer (asserted).
 * lenp		may be NULL. If *lenp is non zero on entry it is the size
 *		of the buffer including the '\0' terminator; output stops
 *		before the first character that would not fit. On return it
 *		holds the output length: with a buffer this counts the
 *		terminator, in length-only mode it does not.
 *
 * Always returns C3_OK.
 *
 * Fixes relative to the previous version:
 *   - the size-limited loop checked for one free octet but wrote two,
 *     then wrote the terminator past the end of the buffer; the full
 *     width of each character is now checked before it is written;
 *   - the output pointer is unsigned char *, matching outp, so values of
 *     0x80 and above are stored without a narrowing conversion.
 *
 * NOTE(review): `map` is never consulted; each input octet is encoded as
 * the code point of the same numeric value, which is only correct for
 * Latin-1 input. Presumably the octet should first be translated through
 * `map` - confirm the table layout in char_map.h before wiring that in.
 */
C3Error
C3_map_utf8(
	    const int *map,
	    const char *string,
	    unsigned char **outp,
	    int *lenp
)
{
	register const unsigned char * f = (const unsigned char *) string;
	register int    len = 0, i;
	register unsigned char * out;
	int tmp = 0;

	(void) map;		/* see NOTE(review) above */

	if (!lenp)
		lenp = &tmp;

	if (outp) {
		assert(*outp);
		out = *outp;
	} else
		out = NULL;

	if (out) {
		unsigned char *begin = out;
		if (*lenp) {
			/* keep the last octet free for the terminator */
			register unsigned char * end = out + *lenp - 1;
			while ((out < end) && (i = *f) != 0) {
				if (c3_utf8_octets(i) > end - out)
					break;
				f++;
				out = c3_utf8_put(out, i);
			}
		} else {
			while ((i = *f++) != 0)
				out = c3_utf8_put(out, i);
		}
		*out++ = 0;
		len = (int) (out - begin);
	} else {
		while ((i = *f++) != 0)
			len += c3_utf8_octets(i);
	}
	*lenp = len;
	return C3_OK;
}

/*
 * Find the index of the conversion map whose name matches `string`.
 *
 * First pass: exact match, ignoring case. Second pass: the first map whose
 * name contains the lowercased `string` (only its first 100 characters are
 * considered) as a substring. Returns 0 - the ascii default - when nothing
 * matches or `string` is empty.
 *
 * Fixes relative to the previous version:
 *   - the lowercased copy was built but the substring pass searched for
 *     the original string instead;
 *   - the substring test was inverted (`!strstr`), so it returned the
 *     first map that did NOT contain the string;
 *   - tolower() is now given an unsigned char value, as <ctype.h> requires.
 *
 * NOTE(review): the substring pass assumes the names in C3_maps[] are
 * stored in lower case - confirm against char_map.h.
 */
int
C3_which_map(
     const char *string
)
{
	int i; char tmp[101];

	for(i=0; i<MAPS; i++) 
		if ((C3_maps[i]) && (!strcasecmp(C3_maps[i],string)))
			return i;

	for(i=0;i<100 && string[i];i++)
		tmp[i]=(char) tolower((unsigned char) string[i]);
	tmp[i]='\0';

	/* an empty needle would match every name; treat it as "no match" */
	if (tmp[0] != '\0')
		for(i=0; i<MAPS; i++)
			if ((C3_maps[i]) && (strstr(C3_maps[i],tmp) != NULL))
				return i;

	/* thzee horrible ascii default... 
	 */
	return 0;	
}

/*
 * Return a human readable message for the error code x.
 *
 * Codes from C3_DUH up to and including 0 index the table below by their
 * negated value. Positive codes, and codes below C3_DUH, are passed to
 * strerror() as before.
 *
 * Fixes relative to the previous version:
 *   - code 0 was sent to strerror(), which left the "Ok" entry
 *     unreachable; it now yields "Ok" (assumes C3_OK is 0, as the table
 *     order implies);
 *   - the table is static const and no longer rebuilt on every call;
 *   - the index is checked against the table size, so a C3_DUH that lies
 *     beyond the last entry cannot read past the array.
 */
const char     *
C3_strerror(
	    C3Error x
)
{
	static const char *const errors[] = {
		"Ok",
		"End of string",
		"Conversion table not defined",
		"Code point out of range for current conversion tables",
		"Illegal or unexpected UTF8 or Unicode sequence",
		"Truncated UTF8 sequence",
		"Bug!"
	};
	const int       count = (int) (sizeof errors / sizeof errors[0]);
	int             idx;

	if ((x > 0) || (x < C3_DUH))
		return strerror(x);

	idx = -(int) x;
	if (idx >= count)
		idx = count - 1;	/* unknown internal code: "Bug!" */

	return errors[idx];
}



More information about the JDev mailing list