[JDEV] Question on how to encode unicode into utf8 for jabber
Dirk-Willem van Gulik
dirkx at covalent.net
Thu Oct 18 13:20:18 CDT 2001
On Thu, 18 Oct 2001, David Rainville wrote:
> Hi Everyone,
> I tried to encode the Unicode character set to fit in the utf8 character
> set. I encoded every character in this structure: \uxxxx, where x are
> hexadecimal digits. It works on my client because I decode it in a way that
> it converts the \uxxxx to the Unicode character. Is it the way to do it?
> Will every other client have this as a standard?
No. UTF-8 does not quite work that way. You may want to get yourself a copy
of the Unicode Standard (www.unicode.org or amazon.com :-). See the attached
little routines.
Dw
-------------- next part --------------
/*
* ====================================================================
* Copyright (c) 1999 Dirk-Willem van Gulik - WebWeaving m/v
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. All advertising materials mentioning features or use of this software must
* display the following acknowledgment: "This product includes software
* developed by WebWeaving Consulancy (http://www.webweaving.org).
*
* 4. The name "WebWeaving", must not be used to endorse or promote products
* derived from this software without prior written permission. For written
* permission, please contact dirkx at webweaving.org.
*
* 5. Redistributions of any form whatsoever must retain the following
* acknowledgment: "This product includes software developed by WebWeaving
* for use in the Apache HTTP server project (http://www.apache.org/)."
*
* THIS SOFTWARE IS PROVIDED BY WEBWEAVING AND AFFILIATES ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL WEBWEAVING OR ITS AFFILIATES OR ITS
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*
* Simple UTF8 / Unicode / LatinX conversion utilities. Note that these are
* incomplete and ONLY do the first few tens of codepages. The higher ones
* (i.e. the full 32 bits) are left as an exercise to the reader.. and have
* not been defined anyway at the time of writing this...
*
* Version 0.00 Winter 1997 First version
*         0.01 Fall 1991   minor speed ups.
*         0.02 May 1995    Alpine; apache pool
*         0.04 Jun 1996    Alpine again.
*/
#include <sys/param.h> /* NULL, BSD or sV define */
#include <assert.h> /* -NDEBUG for needless assert()s */
#include <ctype.h> /* tolower */
#ifdef BSD
#include <string.h>
#else
#include <string.h>
#endif
#include <strings.h> /* strcasecmp */
#include "char_map.h"
#include "char_util.h"
/*
* Convert utf8 / latin strings.
*
* pool      Memory pool (apache style)
* latin     Integer 1 .. 8 for the latin tables
* *string   String pointer to '\0' terminated octet string
* **outp    Output pointer; semantics
*               NULL    No conversion; just return the len.
*               *NULL   Create a palloced space
*               !*NULL  Copy into provided space. Which must be big
*                       enough :-) i.e. 3x for latin to utf8 and 1x for
*                       utf8 to latin in the worst (but likely) case.
* *len      When the function returns C3_OK len will contain the length
*           (in octets) of the converted string. Upon entry if *len != 0
*           then the *outp will be limited to that length (including the
*           '\0' terminator).
*/
/*
* read a 8, 16 or 8,12,16 bit char into i, from the input thing (f)
* depending on the settings of 't'.
*/
/*
 * GETOCTED(i, f): read one octet from the cursor f into i and advance f.
 * A zero octet (end of string) is reported as C3_EOF.
 *
 * The previous form used `*f++ || C3_EOF`; in C that is a logical OR which
 * yields only 0 or 1, so the octet value itself was lost.
 */
#define GETOCTED(i,f) do { \
    (i) = (unsigned char) *(f)++; \
    if ((i) == 0) \
        (i) = C3_EOF; \
} while (0)
/*
 * GETUNICODE(i, f): read one 16 bit character, high octet first, from the
 * cursor f into i and advance f.
 *
 *   - a zero high octet is treated as end of string   -> C3_EOF
 *   - a zero low octet is treated as a cut-off string -> C3_TRUNC
 *   - a value above MAX_CCS                           -> C3_MAX
 *
 * The previous form used `*f++ || C3_TRUNC`, a logical OR that always
 * produced 1 for the low octet instead of the octet or the error code.
 * Locals carry a c3_ prefix so they cannot capture the caller's names.
 */
#define GETUNICODE(i,f) do { \
    register int c3_hi, c3_lo; \
    if (!(c3_hi = (unsigned char) *(f)++)) \
        (i) = C3_EOF; \
    else if (!(c3_lo = (unsigned char) *(f)++)) \
        (i) = C3_TRUNC; \
    else \
        (i) = (c3_hi << 8) + c3_lo; \
    if ((i) > MAX_CCS) \
        (i) = C3_MAX; \
} while (0)
/*
 * GETUTF8(i, f): decode one UTF-8 sequence of one, two or three octets from
 * the cursor f into i and advance f past the octets that were read.
 *
 *   - zero lead octet (end of string)                  -> C3_EOF
 *   - string ends inside a multi-octet sequence        -> C3_TRUNC
 *   - continuation octet not of the form 10xxxxxx, or
 *     a lead octet this decoder does not handle        -> C3_ILLEGAL
 *     (the latter could arguably be a C3_MAX too)
 *   - decoded value above MAX_CCS                      -> C3_MAX
 *
 * Fixes relative to the previous form:
 *   - the third octet was read as `t = *f++ || C3_TRUNC`, a logical OR that
 *     left t holding a boolean, so every three-octet sequence was rejected
 *     as C3_ILLEGAL;
 *   - the two-octet branch now validates its continuation octet the same
 *     way the three-octet branch does;
 *   - locals carry a c3_ prefix so they cannot capture the caller's names.
 */
#define GETUTF8(i,f) do { \
    register int c3_b0, c3_b1 = 0, c3_b2 = 0, c3_hi; \
    c3_b0 = (unsigned char) *(f)++; \
    c3_hi = c3_b0 >> 4; \
    if (c3_b0 == 0) \
        (i) = C3_EOF; \
    else if (c3_hi < 8) \
        (i) = c3_b0; \
    else if (c3_hi == 12 || c3_hi == 13) { \
        c3_b1 = (unsigned char) *(f)++; \
        if (c3_b1 == 0) \
            (i) = C3_TRUNC; \
        else if ((c3_b1 & 0xC0) != 0x80) \
            (i) = C3_ILLEGAL; \
        else \
            (i) = ((c3_b0 & 0x1F) << 6) | (c3_b1 & 0x3F); \
    } else if (c3_hi == 14) { \
        if ((c3_b1 = (unsigned char) *(f)++) == 0 || \
            (c3_b2 = (unsigned char) *(f)++) == 0) \
            (i) = C3_TRUNC; \
        else if ((c3_b1 & 0xC0) != 0x80 || \
                 (c3_b2 & 0xC0) != 0x80) \
            (i) = C3_ILLEGAL; \
        else \
            (i) = ((c3_b0 & 0x0F) << 12) | \
                  ((c3_b1 & 0x3F) << 6) | \
                  (c3_b2 & 0x3F); \
    } else \
        (i) = C3_ILLEGAL; \
    if ((i) > MAX_CCS) \
        (i) = C3_MAX; \
} while (0)
/*
* note no MAX_CCS check
*/
/*
 * Convert a UTF-8 string to the Latin table selected by `latin`.
 *
 * `latin` indexes unicode_latin[]; when it is outside 0 .. MAPS-1 or that
 * slot is empty, C3_NO_CNV is returned. Otherwise the work is handed to
 * C3_map_latin() with the selected table, and its result is returned
 * unchanged. `string`, `out` and `len` are passed through untouched.
 */
C3Error
C3_utf8_to_latin(
	int latin,
	const char *string,
	char **out,
	int *len
)
{
	const int have_table =
		(latin >= 0) && (latin < MAPS) && unicode_latin[latin];

	if (have_table)
		return C3_map_latin(unicode_latin[latin], string, out, len);

	return C3_NO_CNV;
}
/*
 * Decode the UTF-8 string `string` and translate it through `map`.
 *
 * map     Table indexed by decoded code point. Each entry is either NULL
 *         (the character is dropped) or a '\0' terminated octet string
 *         that is emitted in place of the character.
 * string  '\0' terminated UTF-8 input.
 * outp    NULL: nothing is written, only the output length is counted.
 *         Otherwise *outp is the destination buffer (asserted non-NULL).
 * lenp    May be NULL. On entry a non-zero *lenp limits the output to that
 *         many octets including the '\0' terminator; output that does not
 *         fit is silently cut off. On a C3_OK return *lenp holds the length:
 *         when writing, the octets written including the terminator; when
 *         only counting, the octets needed excluding the terminator.
 *
 * Returns C3_OK, or the decoder's error code (anything GETUTF8 reports
 * other than C3_EOF). The output buffer is '\0' terminated in either case;
 * *lenp is only updated on C3_OK.
 *
 * Fixes relative to the previous version:
 *   - the counting loop never advanced through the mapped string and so
 *     never terminated once a mapped character was met;
 *   - with a length limit, a multi-octet mapping could be copied past the
 *     end of the buffer;
 *   - hitting the length limit returned the last decoded code point as if
 *     it were an error code.
 */
C3Error
C3_map_latin(
	const unsigned char * *map,
	const unsigned char *string,
	char ** outp,
	int *lenp
)
{
	const unsigned char *f = string;
	const unsigned char *p;
	int i = (int) C3_EOF;
	int len = 0;
	int tmp = 0;
	char *out = NULL;
	char *begin;
	char *end = NULL;	/* last slot, reserved for the terminator */

	if (!lenp)
		lenp = &tmp;

	if (outp) {
		assert(*outp);
		out = *outp;
	}
	begin = out;
	if (out && *lenp)
		end = out + *lenp - 1;

	for (;;) {
		/* Stop before decoding once a limited buffer is full. */
		if (end && out >= end)
			break;

		GETUTF8(i, f);
		if (i <= 0)
			break;

		p = map[i];
		if (!p)
			continue;

		for (; *p; p++) {
			if (!out)
				len++;
			else if (!end || out < end)
				*out++ = (char) *p;
			else
				break;
		}
	}

	if (out) {
		*out++ = '\0';
		len = (int) (out - begin);
	}

	/* Only genuine decoder errors are negative and not C3_EOF. */
	if (i < 0 && i != (int) C3_EOF)
		return (C3Error) i;

	*lenp = len;
	return C3_OK;
}
/*
 * Convert a string in the Latin table selected by `latin` to UTF-8.
 *
 * `latin` indexes latin_unicode[]; when it is outside 0 .. MAPS-1 or that
 * slot is empty, C3_NO_CNV is returned. Otherwise the work is handed to
 * C3_map_utf8() with the selected table, and its result is returned
 * unchanged. `string`, `out` and `len` are passed through untouched.
 */
C3Error
C3_latin_to_utf8(
	int latin,
	const char *string,
	unsigned char **out,
	int *len
)
{
	const int have_table =
		(latin >= 0) && (latin < MAPS) && latin_unicode[latin];

	if (have_table)
		return C3_map_utf8(latin_unicode[latin], string, out, len);

	return C3_NO_CNV;
}
/* Number of octets (1..3) this encoder emits for code point cp. */
static int
c3_utf8_width(int cp)
{
	if (cp >= 0x0001 && cp <= 0x007F)
		return 1;
	if (cp > 0x07FF)
		return 3;
	return 2;
}

/* Write the `width` octet UTF-8 form of cp at out; return the new cursor. */
static unsigned char *
c3_utf8_put(unsigned char *out, int cp, int width)
{
	switch (width) {
	case 1:
		*out++ = (unsigned char) cp;
		break;
	case 3:
		*out++ = (unsigned char) (0xE0 | ((cp >> 12) & 0x0F));
		*out++ = (unsigned char) (0x80 | ((cp >> 6) & 0x3F));
		*out++ = (unsigned char) (0x80 | ((cp >> 0) & 0x3F));
		break;
	default:
		*out++ = (unsigned char) (0xC0 | ((cp >> 6) & 0x1F));
		*out++ = (unsigned char) (0x80 | ((cp >> 0) & 0x3F));
		break;
	}
	return out;
}

/*
 * Encode the '\0' terminated octet string `string` as UTF-8.
 *
 * map     Conversion table supplied by the caller.
 *         NOTE(review): the table is not consulted; every input octet is
 *         used directly as its own code point, exactly as before. It looks
 *         as if a lookup through `map` was intended - confirm against the
 *         table layout in char_map.h before changing this.
 * string  '\0' terminated input.
 * outp    NULL: nothing is written, only the output length is counted.
 *         Otherwise *outp is the destination buffer (asserted non-NULL).
 * lenp    May be NULL. On entry a non-zero *lenp limits the output to that
 *         many octets including the '\0' terminator; conversion stops
 *         before the first character that would not fit completely. On
 *         return *lenp holds the length: when writing, the octets written
 *         including the terminator; when only counting, the octets needed
 *         excluding the terminator.
 *
 * Always returns C3_OK.
 *
 * Fix relative to the previous version: with a length limit the room left
 * was only checked once per character, after which up to three octets were
 * written, so the buffer could be overrun by up to two octets plus the
 * terminator. The encoder, previously written out three times, now lives
 * in the two helpers above.
 */
C3Error
C3_map_utf8(
	const int *map,
	const char *string,
	unsigned char **outp,
	int *lenp
)
{
	const unsigned char *f = (const unsigned char *) string;
	unsigned char *out = NULL;
	unsigned char *begin;
	unsigned char *end = NULL;	/* last slot, reserved for the terminator */
	int len = 0;
	int tmp = 0;
	int cp;

	(void) map;	/* see NOTE(review) above */

	if (!lenp)
		lenp = &tmp;

	if (outp) {
		assert(*outp);
		out = *outp;
	}
	begin = out;
	if (out && *lenp)
		end = out + *lenp - 1;

	for (; (cp = *f) != 0; f++) {
		const int width = c3_utf8_width(cp);

		if (!out) {
			len += width;
			continue;
		}
		/* Never start a character that cannot be completed. */
		if (end && (end - out) < width)
			break;
		out = c3_utf8_put(out, cp, width);
	}

	if (out) {
		*out++ = 0;
		len = (int) (out - begin);
	}

	*lenp = len;
	return C3_OK;
}
/*
 * Find the index in C3_maps[] of the character map named by `string`.
 *
 * First an exact, case-insensitive match against every defined entry is
 * tried. Failing that, the first 100 characters of `string` are folded to
 * lower case and the first entry whose name contains that text is chosen.
 * If nothing matches (or `string` is empty) 0 is returned.
 *
 * Fixes relative to the previous version:
 *   - the lower-cased copy was built but the original string was searched;
 *   - the substring test was inverted, so the first entry that did NOT
 *     contain the name was returned;
 *   - tolower() was called with a plain char, which is undefined for
 *     negative values.
 *
 * NOTE(review): the substring pass assumes the names in C3_maps[] are
 * stored in lower case - confirm against char_map.h.
 */
int
C3_which_map(
	const char *string
)
{
	char lowered[101];
	size_t n;
	int i;

	for (i = 0; i < MAPS; i++)
		if (C3_maps[i] && !strcasecmp(C3_maps[i], string))
			return i;

	for (n = 0; n < sizeof lowered - 1 && string[n]; n++)
		lowered[n] = (char) tolower((unsigned char) string[n]);
	lowered[n] = '\0';

	/* An empty needle would match every entry; fall through instead. */
	if (lowered[0] != '\0') {
		for (i = 0; i < MAPS; i++)
			if (C3_maps[i] && strstr(C3_maps[i], lowered))
				return i;
	}

	/* thzee horrible ascii default...
	 */
	return 0;
}
/*
 * Return a human readable message for the error code `x`.
 *
 * Zero and the negative codes down to C3_DUH index the message table below
 * by their negated value. Positive values, and values below C3_DUH, are
 * passed to strerror().
 *
 * Fixes relative to the previous version:
 *   - zero was routed to strerror(), which made the "Ok" entry unreachable;
 *   - the table is now static and const instead of being rebuilt on every
 *     call;
 *   - the index is additionally checked against the table size.
 */
const char *
C3_strerror(
	C3Error x
)
{
	static const char *const messages[] = {
		"Ok",
		"End of string",
		"Conversion table not defined",
		"Code point out of range for current conversion tables",
		"Illegal or unexpected UTF8 or Unicode sequence",
		"Truncated UTF8 sequence",
		"Bug!"
	};
	const int code = (int) x;

	if (code > 0 || code < (int) C3_DUH)
		return strerror(code);

	/* Guard against C3_DUH lying beyond the end of the table. */
	if ((size_t) -code >= sizeof messages / sizeof messages[0])
		return strerror(code);

	return messages[-code];
}
More information about the JDev
mailing list