| /* |
| * encoding.c : implements the encoding conversion functions needed for XML |
| * |
| * Related specs: |
| * rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies |
| * rfc2781 UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau |
| * [ISO-10646] UTF-8 and UTF-16 in Annexes |
| * [ISO-8859-1] ISO Latin-1 characters codes. |
| * [UNICODE] The Unicode Consortium, "The Unicode Standard -- |
| * Worldwide Character Encoding -- Version 1.0", Addison- |
| * Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is |
| * described in Unicode Technical Report #4. |
| * [US-ASCII] Coded Character Set--7-bit American Standard Code for |
| * Information Interchange, ANSI X3.4-1986. |
| * |
| * See Copyright for the status of this software. |
| * |
| * daniel@veillard.com |
| * |
| * UTF8 string routines from: |
| * "William M. Brack" <wbrack@mmm.com.hk> |
| * |
| * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org> |
| */ |
| |
| #define IN_LIBXML |
| #include "libxml.h" |
| |
| #include <string.h> |
| |
| #ifdef HAVE_CTYPE_H |
| #include <ctype.h> |
| #endif |
| #ifdef HAVE_STDLIB_H |
| #include <stdlib.h> |
| #endif |
| #ifdef LIBXML_ICONV_ENABLED |
| #ifdef HAVE_ERRNO_H |
| #include <errno.h> |
| #endif |
| #endif |
| #include <libxml/encoding.h> |
| #include <libxml/xmlmemory.h> |
| #ifdef LIBXML_HTML_ENABLED |
| #include <libxml/HTMLparser.h> |
| #endif |
| #include <libxml/globals.h> |
| #include <libxml/xmlerror.h> |
| |
| static xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL; |
| static xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL; |
| |
| typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias; |
| typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr; |
| struct _xmlCharEncodingAlias { |
| const char *name; |
| const char *alias; |
| }; |
| |
| static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL; |
| static int xmlCharEncodingAliasesNb = 0; |
| static int xmlCharEncodingAliasesMax = 0; |
| |
| #ifdef LIBXML_ICONV_ENABLED |
| #if 0 |
| #define DEBUG_ENCODING /* Define this to get encoding traces */ |
| #endif |
| #else |
| #ifdef LIBXML_ISO8859X_ENABLED |
| static void xmlRegisterCharEncodingHandlersISO8859x (void); |
| #endif |
| #endif |
| |
| static int xmlLittleEndian = 1; |
| |
| /************************************************************************ |
| * * |
| * Generic UTF8 handling routines * |
| * * |
| * From rfc2044: encoding of the Unicode values on UTF-8: * |
| * * |
| * UCS-4 range (hex.) UTF-8 octet sequence (binary) * |
| * 0000 0000-0000 007F 0xxxxxxx * |
| * 0000 0080-0000 07FF 110xxxxx 10xxxxxx * |
| * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx * |
| * * |
| * I hope we won't use values > 0xFFFF anytime soon ! * |
| * * |
| ************************************************************************/ |
| |
| /** |
| * xmlUTF8Size: |
| * @utf: pointer to the UTF8 character |
| * |
| * calculates the internal size of a UTF8 character |
| * |
| * returns the numbers of bytes in the character, -1 on format error |
| */ |
| int |
| xmlUTF8Size(const xmlChar *utf) { |
| xmlChar mask; |
| int len; |
| |
| if (utf == NULL) |
| return -1; |
| if (*utf < 0x80) |
| return 1; |
| /* check valid UTF8 character */ |
| if (!(*utf & 0x40)) |
| return -1; |
| /* determine number of bytes in char */ |
| len = 2; |
| for (mask=0x20; mask != 0; mask>>=1) { |
| if (!(*utf & mask)) |
| return len; |
| len++; |
| } |
| return -1; |
| } |
| |
| /** |
| * xmlUTF8Charcmp: |
| * @utf1: pointer to first UTF8 char |
| * @utf2: pointer to second UTF8 char |
| * |
| * compares the two UCS4 values |
| * |
| * returns result of the compare as with xmlStrncmp |
| */ |
| int |
| xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) { |
| |
| if (utf1 == NULL ) { |
| if (utf2 == NULL) |
| return 0; |
| return -1; |
| } |
| return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1)); |
| } |
| |
| /** |
| * xmlUTF8Strlen: |
| * @utf: a sequence of UTF-8 encoded bytes |
| * |
| * compute the length of an UTF8 string, it doesn't do a full UTF8 |
| * checking of the content of the string. |
| * |
| * Returns the number of characters in the string or -1 in case of error |
| */ |
| int |
| xmlUTF8Strlen(const xmlChar *utf) { |
| int ret = 0; |
| |
| if (utf == NULL) |
| return(-1); |
| |
| while (*utf != 0) { |
| if (utf[0] & 0x80) { |
| if ((utf[1] & 0xc0) != 0x80) |
| return(-1); |
| if ((utf[0] & 0xe0) == 0xe0) { |
| if ((utf[2] & 0xc0) != 0x80) |
| return(-1); |
| if ((utf[0] & 0xf0) == 0xf0) { |
| if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80) |
| return(-1); |
| utf += 4; |
| } else { |
| utf += 3; |
| } |
| } else { |
| utf += 2; |
| } |
| } else { |
| utf++; |
| } |
| ret++; |
| } |
| return(ret); |
| } |
| |
/**
 * xmlGetUTF8Char:
 * @utf: a sequence of UTF-8 encoded bytes
 * @len: in: number of bytes available at @utf;
 *       out: number of bytes consumed (0 on error)
 *
 * Reads one UTF-8 character from @utf.  Sequences of up to 4 bytes are
 * decoded; the continuation bytes are checked, but overlong forms and
 * the exact bit pattern of a 2-byte lead are not.
 *
 * Returns the character value, or -1 in case of error.  On error *len
 *         is set to 0, unless @len itself is NULL in which case it is
 *         left alone.
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    /* Without a length there is nothing we can safely read or report. */
    if (len == NULL)
        return(-1);
    if ((utf == NULL) || (*len < 1))
        goto error;

    c = utf[0];
    if (c & 0x80) {
        if (*len < 2)
            goto error;
        if ((utf[1] & 0xc0) != 0x80)
            goto error;
        if ((c & 0xe0) == 0xe0) {
            if (*len < 3)
                goto error;
            if ((utf[2] & 0xc0) != 0x80)
                goto error;
            if ((c & 0xf0) == 0xf0) {
                if (*len < 4)
                    goto error;
                /* leads of the form 11111xxx would need more than 4 bytes */
                if (((c & 0xf8) != 0xf0) || ((utf[3] & 0xc0) != 0x80))
                    goto error;
                /* 4-byte code */
                *len = 4;
                c = (utf[0] & 0x7) << 18;
                c |= (utf[1] & 0x3f) << 12;
                c |= (utf[2] & 0x3f) << 6;
                c |= utf[3] & 0x3f;
            } else {
                /* 3-byte code */
                *len = 3;
                c = (utf[0] & 0xf) << 12;
                c |= (utf[1] & 0x3f) << 6;
                c |= utf[2] & 0x3f;
            }
        } else {
            /* 2-byte code */
            *len = 2;
            c = (utf[0] & 0x1f) << 6;
            c |= utf[1] & 0x3f;
        }
    } else {
        /* 1-byte code */
        *len = 1;
    }
    return((int) c);

error:
    /* len is known to be non-NULL here */
    *len = 0;
    return(-1);
}
| |
/**
 * xmlCheckUTF8:
 * @utf: pointer to a putative UTF-8 encoded, zero-terminated string
 *
 * Checks @utf for being structurally valid UTF-8: every lead byte must
 * be one of 0xxxxxxx, 110xxxxx, 1110xxxx or 11110xxx and be followed by
 * the matching number of 10xxxxxx continuation bytes.  A continuation
 * byte in lead position is rejected.
 *
 * This function is not super-strict: it allows longer UTF-8 sequences
 * than necessary (overlong forms) and, while it enforces the 4-byte
 * maximum size, it does not check for the 0x10ffff maximum value.
 *
 * Returns 1 if @utf is valid, 0 otherwise (including when @utf is NULL)
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    int ix;
    unsigned char c;

    if (utf == NULL)
        return(0);

    /*
     * The || chains below stop at the first failing byte, so the
     * terminating zero is never stepped over.
     */
    while ((c = utf[0]) != 0) {
        if ((c & 0x80) == 0x00) {
            /* 1-byte code: 0xxxxxxx */
            ix = 1;
        } else if ((c & 0xe0) == 0xc0) {
            /* 2-byte code: 110xxxxx */
            if ((utf[1] & 0xc0) != 0x80)
                return(0);
            ix = 2;
        } else if ((c & 0xf0) == 0xe0) {
            /* 3-byte code: 1110xxxx */
            if (((utf[1] & 0xc0) != 0x80) ||
                ((utf[2] & 0xc0) != 0x80))
                return(0);
            ix = 3;
        } else if ((c & 0xf8) == 0xf0) {
            /* 4-byte code: 11110xxx */
            if (((utf[1] & 0xc0) != 0x80) ||
                ((utf[2] & 0xc0) != 0x80) ||
                ((utf[3] & 0xc0) != 0x80))
                return(0);
            ix = 4;
        } else {
            /* stray continuation byte, or a lead for more than 4 bytes */
            return(0);
        }
        utf += ix;
    }
    return(1);
}
| |
| /** |
| * xmlUTF8Strsize: |
| * @utf: a sequence of UTF-8 encoded bytes |
| * @len: the number of characters in the array |
| * |
| * storage size of an UTF8 string |
| * |
| * Returns the storage size of |
| * the first 'len' characters of ARRAY |
| * |
| */ |
| |
| int |
| xmlUTF8Strsize(const xmlChar *utf, int len) { |
| const xmlChar *ptr=utf; |
| xmlChar ch; |
| |
| if (len <= 0) |
| return(0); |
| |
| while ( len-- > 0) { |
| if ( !*ptr ) |
| break; |
| if ( (ch = *ptr++) & 0x80) |
| while ( (ch<<=1) & 0x80 ) |
| ptr++; |
| } |
| return (ptr - utf); |
| } |
| |
| |
| /** |
| * xmlUTF8Strndup: |
| * @utf: the input UTF8 * |
| * @len: the len of @utf (in chars) |
| * |
| * a strndup for array of UTF8's |
| * |
| * Returns a new UTF8 * or NULL |
| */ |
| xmlChar * |
| xmlUTF8Strndup(const xmlChar *utf, int len) { |
| xmlChar *ret; |
| int i; |
| |
| if ((utf == NULL) || (len < 0)) return(NULL); |
| i = xmlUTF8Strsize(utf, len); |
| ret = (xmlChar *) xmlMallocAtomic((i + 1) * sizeof(xmlChar)); |
| if (ret == NULL) { |
| xmlGenericError(xmlGenericErrorContext, |
| "malloc of %ld byte failed\n", |
| (len + 1) * (long)sizeof(xmlChar)); |
| return(NULL); |
| } |
| memcpy(ret, utf, i * sizeof(xmlChar)); |
| ret[i] = 0; |
| return(ret); |
| } |
| |
| /** |
| * xmlUTF8Strpos: |
| * @utf: the input UTF8 * |
| * @pos: the position of the desired UTF8 char (in chars) |
| * |
| * a function to provide the equivalent of fetching a |
| * character from a string array |
| * |
| * Returns a pointer to the UTF8 character or NULL |
| */ |
| xmlChar * |
| xmlUTF8Strpos(const xmlChar *utf, int pos) { |
| xmlChar ch; |
| |
| if (utf == NULL) return(NULL); |
| if ( (pos < 0) || (pos >= xmlUTF8Strlen(utf)) ) |
| return(NULL); |
| while (pos--) { |
| if ((ch=*utf++) == 0) return(NULL); |
| if ( ch & 0x80 ) { |
| /* if not simple ascii, verify proper format */ |
| if ( (ch & 0xc0) != 0xc0 ) |
| return(NULL); |
| /* then skip over remaining bytes for this char */ |
| while ( (ch <<= 1) & 0x80 ) |
| if ( (*utf++ & 0xc0) != 0x80 ) |
| return(NULL); |
| } |
| } |
| return((xmlChar *)utf); |
| } |
| |
| /** |
| * xmlUTF8Strloc: |
| * @utf: the input UTF8 * |
| * @utfchar: the UTF8 character to be found |
| * |
| * a function to provide the relative location of a UTF8 char |
| * |
| * Returns the relative character position of the desired char |
| * or -1 if not found |
| */ |
| int |
| xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) { |
| int i, size; |
| xmlChar ch; |
| |
| if (utf==NULL || utfchar==NULL) return -1; |
| size = xmlUTF8Strsize(utfchar, 1); |
| for(i=0; (ch=*utf) != 0; i++) { |
| if (xmlStrncmp(utf, utfchar, size)==0) |
| return(i); |
| utf++; |
| if ( ch & 0x80 ) { |
| /* if not simple ascii, verify proper format */ |
| if ( (ch & 0xc0) != 0xc0 ) |
| return(-1); |
| /* then skip over remaining bytes for this char */ |
| while ( (ch <<= 1) & 0x80 ) |
| if ( (*utf++ & 0xc0) != 0x80 ) |
| return(-1); |
| } |
| } |
| |
| return(-1); |
| } |
| /** |
| * xmlUTF8Strsub: |
| * @utf: a sequence of UTF-8 encoded bytes |
| * @start: relative pos of first char |
| * @len: total number to copy |
| * |
| * Create a substring from a given UTF-8 string |
| * Note: positions are given in units of UTF-8 chars |
| * |
| * Returns a pointer to a newly created string |
| * or NULL if any problem |
| */ |
| |
| xmlChar * |
| xmlUTF8Strsub(const xmlChar *utf, int start, int len) { |
| int i; |
| xmlChar ch; |
| |
| if (utf == NULL) return(NULL); |
| if (start < 0) return(NULL); |
| if (len < 0) return(NULL); |
| |
| /* |
| * Skip over any leading chars |
| */ |
| for (i = 0;i < start;i++) { |
| if ((ch=*utf++) == 0) return(NULL); |
| if ( ch & 0x80 ) { |
| /* if not simple ascii, verify proper format */ |
| if ( (ch & 0xc0) != 0xc0 ) |
| return(NULL); |
| /* then skip over remaining bytes for this char */ |
| while ( (ch <<= 1) & 0x80 ) |
| if ( (*utf++ & 0xc0) != 0x80 ) |
| return(NULL); |
| } |
| } |
| |
| return(xmlUTF8Strndup(utf, len)); |
| } |
| |
| /************************************************************************ |
| * * |
| * Conversions To/From UTF8 encoding * |
| * * |
| ************************************************************************/ |
| |
/**
 * asciiToUTF8:
 * @out: a pointer to an array of bytes to store the result
 * @outlen: in: the size of @out; out: the number of bytes written
 * @in: a pointer to an array of ASCII chars
 * @inlen: in: the length of @in; out: the number of bytes consumed
 *
 * Takes a block of ASCII chars and copies it out as UTF-8 (which is
 * byte-identical for ASCII).  Conversion stops while fewer than 6 bytes
 * remain free in @out, so the tail of the output buffer is never used.
 *
 * Returns 0 on success, or -1 when a byte >= 0x80 is met; in both cases
 *         @outlen and @inlen describe what was converted before stopping
 */
static int
asciiToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char *dst = out;
    const unsigned char *src = in;
    const unsigned char *srcend = in + (*inlen);
    int status = 0;

    /*
     * The "+ 5" slack in the loop condition already guarantees room for
     * the single byte stored per iteration.
     */
    while ((src < srcend) && (dst - out + 5 < *outlen)) {
        if (*src >= 0x80) {
            /* not ASCII: leave the offending byte unconsumed */
            status = -1;
            break;
        }
        *dst++ = *src++;
    }

    *outlen = dst - out;
    *inlen = src - in;
    return(status);
}
| |
| #ifdef LIBXML_OUTPUT_ENABLED |
/**
 * UTF8Toascii:
 * @out: a pointer to an array of bytes to store the result
 * @outlen: in: the size of @out; out: the number of bytes written
 * @in: a pointer to an array of UTF-8 chars, or NULL for initialization
 * @inlen: in: the length of @in; out: the number of bytes consumed
 *
 * Takes a block of UTF-8 chars and tries to convert it to an ASCII
 * block of chars.  Conversion stops quietly when @out is full or when
 * @in ends in the middle of a character.
 *
 * Returns 0 on success, or -2 if the transcoding fails: invalid lead
 *         byte, malformed continuation byte, or a character that has no
 *         ASCII representation.  In every case @outlen and @inlen
 *         describe the complete characters converted so far.
 */
static int
UTF8Toascii(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const unsigned char *processed = in;
    const unsigned char *outstart = out;
    const unsigned char *instart = in;
    const unsigned char *outend;
    const unsigned char *inend;
    unsigned int c, d;
    int trailing;
    int ret = 0;

    if (in == NULL) {
        /*
         * initialization nothing to do
         */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    outend = out + (*outlen);

    while (in < inend) {
        d = *in++;
        if (d < 0x80) {
            c = d;
            trailing = 0;
        } else if (d < 0xC0) {
            /* trailing byte in leading position */
            ret = -2;
            break;
        } else if (d < 0xE0) {
            c = d & 0x1F;
            trailing = 1;
        } else if (d < 0xF0) {
            c = d & 0x0F;
            trailing = 2;
        } else if (d < 0xF8) {
            c = d & 0x07;
            trailing = 3;
        } else {
            /* no chance for this in Ascii */
            ret = -2;
            break;
        }

        /* character split across buffers: wait for the rest */
        if (inend - in < trailing)
            break;

        for ( ; trailing > 0; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80) {
                /* malformed sequence: fail instead of emitting garbage */
                ret = -2;
                goto done;
            }
            c = (c << 6) | (d & 0x3F);
        }

        /* assertion: c is a single UCS-4 value */
        if (c >= 0x80) {
            /* no chance for this in Ascii */
            ret = -2;
            break;
        }
        if (out >= outend)
            break;
        *out++ = (unsigned char) c;
        processed = in;
    }

done:
    *outlen = out - outstart;
    *inlen = processed - instart;
    return(ret);
}
| #endif /* LIBXML_OUTPUT_ENABLED */ |
| |
/**
 * isolat1ToUTF8:
 * @out: a pointer to an array of bytes to store the result
 * @outlen: in: the size of @out; out: the number of bytes written
 * @in: a pointer to an array of ISO Latin 1 chars
 * @inlen: in: the length of @in; out: the number of bytes consumed
 *
 * Takes a block of ISO Latin 1 chars and converts it to a UTF-8 block
 * of chars.  Bytes below 0x80 are copied, the others become a 2-byte
 * sequence.  Conversion stops when @out cannot hold the next character.
 * No byte outside [@in, @in + *@inlen) is ever read.
 *
 * Returns 0 on success, or -1 if any argument is NULL
 */
int
isolat1ToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char *outstart = out;
    const unsigned char *base = in;
    unsigned char *outend;
    const unsigned char *inend;
    const unsigned char *instop;

    if ((out == NULL) || (in == NULL) || (outlen == NULL) || (inlen == NULL))
        return(-1);

    outend = out + *outlen;
    inend = in + (*inlen);
    instop = inend;

    /* main loop: needs room for a 2-byte sequence */
    while ((in < inend) && (outend - out > 1)) {
        if (*in >= 0x80) {
            *out++ = (unsigned char) (((*in >> 6) & 0x1F) | 0xC0);
            *out++ = (unsigned char) ((*in & 0x3F) | 0x80);
            ++in;
        }
        /* bound the ASCII run below by the space left in @out */
        if (instop - in > outend - out)
            instop = in + (outend - out);
        while ((in < instop) && (*in < 0x80))
            *out++ = *in++;
    }
    /* exactly one byte left in @out: it can still take an ASCII char */
    if ((in < inend) && (out < outend) && (*in < 0x80))
        *out++ = *in++;

    *outlen = out - outstart;
    *inlen = in - base;
    return(0);
}
| |
/**
 * UTF8ToUTF8:
 * @out: a pointer to an array of bytes to store the result
 * @outlen: in: the size of @out; out: the number of bytes copied
 * @inb: a pointer to an array of UTF-8 chars
 * @inlenb: in: the length of @inb in bytes; out: the number of bytes
 *          consumed
 *
 * No-op copy operation for UTF-8 handling: copies as many bytes as fit,
 * i.e. the smaller of *@outlen and *@inlenb, without looking at
 * character boundaries.
 *
 * Returns 0 on success, or -1 if an argument is NULL or the number of
 *         bytes to copy would be negative
 */
static int
UTF8ToUTF8(unsigned char* out, int *outlen,
           const unsigned char* inb, int *inlenb)
{
    int count;

    if (out == NULL || inb == NULL || outlen == NULL || inlenb == NULL)
        return(-1);

    /* copy whatever both buffers can accommodate */
    count = (*outlen > *inlenb) ? *inlenb : *outlen;
    if (count < 0)
        return(-1);

    memcpy(out, inb, count);

    *inlenb = count;
    *outlen = count;
    return(0);
}
| |
| |
| #ifdef LIBXML_OUTPUT_ENABLED |
/**
 * UTF8Toisolat1:
 * @out: a pointer to an array of bytes to store the result
 * @outlen: in: the size of @out; out: the number of bytes written
 * @in: a pointer to an array of UTF-8 chars, or NULL for initialization
 * @inlen: in: the length of @in; out: the number of bytes consumed
 *
 * Takes a block of UTF-8 chars and tries to convert it to an ISO
 * Latin 1 block of chars.  Conversion stops quietly when @out is full
 * or when @in ends in the middle of a character.
 *
 * Returns 0 on success, or -2 if the transcoding fails: invalid lead
 *         byte, malformed continuation byte, or a character above 0xFF.
 *         In every case @outlen and @inlen describe the complete
 *         characters converted so far.
 */
int
UTF8Toisolat1(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const unsigned char *dst_start = out;
    const unsigned char *src_start = in;
    const unsigned char *done_upto = in;
    const unsigned char *dst_end;
    const unsigned char *src_end;
    unsigned int value, byte;
    int extra;
    int status = 0;

    if (in == NULL) {
        /* initialization call: nothing to do */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    src_end = in + (*inlen);
    dst_end = out + (*outlen);

    while (in < src_end) {
        byte = *in++;
        if (byte < 0x80) {
            value = byte;
            extra = 0;
        } else if (byte < 0xC0) {
            /* trailing byte in leading position */
            status = -2;
            break;
        } else if (byte < 0xE0) {
            value = byte & 0x1F;
            extra = 1;
        } else if (byte < 0xF0) {
            value = byte & 0x0F;
            extra = 2;
        } else if (byte < 0xF8) {
            value = byte & 0x07;
            extra = 3;
        } else {
            /* no chance for this in IsoLat1 */
            status = -2;
            break;
        }

        /* character split across buffers: wait for the rest */
        if (src_end - in < extra)
            break;

        /* the check above guarantees that @extra bytes are available */
        while (extra > 0) {
            byte = *in++;
            if ((byte & 0xC0) != 0x80) {
                status = -2;
                goto finish;
            }
            value = (value << 6) | (byte & 0x3F);
            extra--;
        }

        /* assertion: value is a single UCS-4 value */
        if (value > 0xFF) {
            /* no chance for this in IsoLat1 */
            status = -2;
            break;
        }
        if (out >= dst_end)
            break;
        *out++ = value;
        done_upto = in;
    }

finish:
    *outlen = out - dst_start;
    *inlen = done_upto - src_start;
    return(status);
}
| #endif /* LIBXML_OUTPUT_ENABLED */ |
| |
/**
 * UTF16LEToUTF8:
 * @out: a pointer to an array of bytes to store the result
 * @outlen: in: the size of @out; out: the number of bytes written
 * @inb: a pointer to UTF-16LE data passed as a byte array
 * @inlenb: in: the length of @inb in bytes; out: the number of bytes
 *          consumed
 *
 * Takes a block of UTF-16LE code units and converts it to a UTF-8 block
 * of chars.  The input is decoded byte by byte, so @inb needs no
 * particular alignment and the result does not depend on the byte order
 * of the host.  A dangling odd byte at the end of @inb is ignored.
 *
 * Conversion stops while fewer than 6 bytes remain free in @out, or
 * when a surrogate pair is split by the end of @inb; in both cases the
 * unconverted input is simply left unconsumed.
 *
 * Returns 0 on success, or -2 if the transcoding fails (a high
 *         surrogate not followed by a low surrogate).  In every case
 *         @outlen and @inlenb describe the complete characters
 *         converted so far.
 */
static int
UTF16LEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    unsigned char *outstart = out;
    const unsigned char *processed = inb;
    const unsigned char *in = inb;
    const unsigned char *inend;
    unsigned int c, d;
    int bits;

    /* only whole 16-bit units can be decoded */
    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + *inlenb;

    /*
     * The "+ 5" slack guarantees room for the longest (4-byte) sequence,
     * so no further output checks are needed inside the loop.
     */
    while ((in < inend) && (out - outstart + 5 < *outlen)) {
        c = (unsigned int) in[0] | ((unsigned int) in[1] << 8);
        in += 2;

        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend) {
                /* pair split across buffers: wait for the second half */
                break;
            }
            d = (unsigned int) in[0] | ((unsigned int) in[1] << 8);
            in += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = out - outstart;
                *inlenb = processed - inb;
                return(-2);
            }
            c = (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x80) {
            *out++ = (unsigned char) c;
            bits = -6;
        } else if (c < 0x800) {
            *out++ = (unsigned char) (((c >> 6) & 0x1F) | 0xC0);
            bits = 0;
        } else if (c < 0x10000) {
            *out++ = (unsigned char) (((c >> 12) & 0x0F) | 0xE0);
            bits = 6;
        } else {
            *out++ = (unsigned char) (((c >> 18) & 0x07) | 0xF0);
            bits = 12;
        }
        for ( ; bits >= 0; bits -= 6)
            *out++ = (unsigned char) (((c >> bits) & 0x3F) | 0x80);

        processed = in;
    }

    *outlen = out - outstart;
    *inlenb = processed - inb;
    return(0);
}
| |
| #ifdef LIBXML_OUTPUT_ENABLED |
/**
 * UTF8ToUTF16LE:
 * @outb: a pointer to an array of bytes to store the result
 * @outlen: in: the size of @outb in bytes; out: the number of bytes
 *          written
 * @in: a pointer to an array of UTF-8 chars, or NULL for initialization
 * @inlen: in: the length of @in; out: the number of bytes consumed
 *
 * Takes a block of UTF-8 chars and tries to convert it to a UTF-16LE
 * block of code units.  The output is stored byte by byte, so @outb
 * needs no particular alignment and the result does not depend on the
 * byte order of the host.  No byte order mark is produced.
 *
 * Conversion stops quietly when @outb cannot hold the next character or
 * when @in ends in the middle of a character.
 *
 * Returns 0 on success, or -2 if the transcoding failed: invalid lead
 *         byte, malformed continuation byte, or a value above 0x10FFFF.
 *         In every case @outlen and @inlen describe the complete
 *         characters converted so far.
 */
static int
UTF8ToUTF16LE(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen)
{
    unsigned char *out = outb;
    unsigned char *outend;
    const unsigned char *processed = in;
    const unsigned char *instart = in;
    const unsigned char *inend;
    unsigned int c, d, hi, lo;
    int trailing;
    int ret = 0;

    /* UTF16LE encoding has no BOM */
    if (in == NULL) {
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    /* only whole 16-bit units are ever written */
    outend = outb + (*outlen / 2) * 2;

    while (in < inend) {
        d = *in++;
        if (d < 0x80) {
            c = d;
            trailing = 0;
        } else if (d < 0xC0) {
            /* trailing byte in leading position */
            ret = -2;
            break;
        } else if (d < 0xE0) {
            c = d & 0x1F;
            trailing = 1;
        } else if (d < 0xF0) {
            c = d & 0x0F;
            trailing = 2;
        } else if (d < 0xF8) {
            c = d & 0x07;
            trailing = 3;
        } else {
            /* no chance for this in UTF-16 */
            ret = -2;
            break;
        }

        /* character split across buffers: wait for the rest */
        if (inend - in < trailing)
            break;

        for ( ; trailing > 0; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80) {
                /* malformed sequence: fail instead of emitting garbage */
                ret = -2;
                goto done;
            }
            c = (c << 6) | (d & 0x3F);
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            *out++ = (unsigned char) (c & 0xFF);
            *out++ = (unsigned char) (c >> 8);
        } else if (c < 0x110000) {
            /* needs a surrogate pair */
            if (outend - out < 4)
                break;
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            *out++ = (unsigned char) (hi & 0xFF);
            *out++ = (unsigned char) (hi >> 8);
            *out++ = (unsigned char) (lo & 0xFF);
            *out++ = (unsigned char) (lo >> 8);
        } else {
            /* beyond the range UTF-16 can represent */
            ret = -2;
            break;
        }
        processed = in;
    }

done:
    *outlen = out - outb;
    *inlen = processed - instart;
    return(ret);
}
| #endif /* LIBXML_OUTPUT_ENABLED */ |
| |
#ifdef LIBXML_OUTPUT_ENABLED
/**
 * UTF8ToUTF16:
 * @outb: a pointer to an array of bytes to store the result
 * @outlen: in: the size of @outb in bytes; out: the number of bytes
 *          written
 * @in: a pointer to an array of UTF-8 chars, or NULL for initialization
 * @inlen: in: the length of @in; out: the number of bytes consumed
 *
 * Takes a block of UTF-8 chars and tries to convert it to a UTF-16
 * block of chars.  The initialization call (@in == NULL) emits the
 * UTF-16LE byte order mark FF FE if @outb has room for it; the actual
 * conversion is delegated to UTF8ToUTF16LE().
 *
 * This function depends on UTF8ToUTF16LE(), which only exists when
 * LIBXML_OUTPUT_ENABLED is defined, hence the same guard here.
 *
 * Returns 2 when the byte order mark was written, 0 when the
 *         initialization call had no room for it, otherwise the result
 *         of UTF8ToUTF16LE()
 */
static int
UTF8ToUTF16(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen)
{
    if (in != NULL)
        return (UTF8ToUTF16LE(outb, outlen, in, inlen));

    /*
     * initialization, add the Byte Order Mark for UTF-16LE
     */
    *inlen = 0;
    if (*outlen < 2) {
        *outlen = 0;
        return(0);
    }
    outb[0] = 0xFF;
    outb[1] = 0xFE;
    *outlen = 2;
#ifdef DEBUG_ENCODING
    xmlGenericError(xmlGenericErrorContext,
                    "Added FFFE Byte Order Mark\n");
#endif
    return(2);
}
#endif /* LIBXML_OUTPUT_ENABLED */
| |
/**
 * UTF16BEToUTF8:
 * @out: a pointer to an array of bytes to store the result
 * @outlen: in: the size of @out; out: the number of bytes written
 * @inb: a pointer to UTF-16BE data passed as a byte array
 * @inlenb: in: the length of @inb in bytes; out: the number of bytes
 *          consumed
 *
 * Takes a block of UTF-16BE code units and converts it to a UTF-8 block
 * of chars.  The input is decoded byte by byte, so @inb needs no
 * particular alignment and the result does not depend on the byte order
 * of the host.  A dangling odd byte at the end of @inb is ignored.
 *
 * Conversion stops quietly when @out cannot hold the complete next
 * character (a partial sequence is never written) or when a surrogate
 * pair is split by the end of @inb; the unconverted input is simply
 * left unconsumed.
 *
 * Returns 0 on success, or -2 if the transcoding fails (a high
 *         surrogate not followed by a low surrogate).  In every case
 *         @outlen and @inlenb describe the complete characters
 *         converted so far.
 */
static int
UTF16BEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    unsigned char *outstart = out;
    unsigned char *outend = out + *outlen;
    const unsigned char *processed = inb;
    const unsigned char *in = inb;
    const unsigned char *inend;
    unsigned int c, d;
    int need, bits;

    /* only whole 16-bit units can be decoded */
    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + *inlenb;

    while (in < inend) {
        c = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
        in += 2;

        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend) {
                /* pair split across buffers: wait for the second half */
                break;
            }
            d = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
            in += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = out - outstart;
                *inlenb = processed - inb;
                return(-2);
            }
            c = (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x80)
            need = 1;
        else if (c < 0x800)
            need = 2;
        else if (c < 0x10000)
            need = 3;
        else
            need = 4;

        /* all or nothing: never leave a truncated sequence in @out */
        if (outend - out < need)
            break;

        if (need == 1)
            *out++ = (unsigned char) c;
        else if (need == 2)
            *out++ = (unsigned char) (((c >> 6) & 0x1F) | 0xC0);
        else if (need == 3)
            *out++ = (unsigned char) (((c >> 12) & 0x0F) | 0xE0);
        else
            *out++ = (unsigned char) (((c >> 18) & 0x07) | 0xF0);

        /* need - 1 continuation bytes, most significant bits first */
        for (bits = (need - 2) * 6; bits >= 0; bits -= 6)
            *out++ = (unsigned char) (((c >> bits) & 0x3F) | 0x80);

        processed = in;
    }

    *outlen = out - outstart;
    *inlenb = processed - inb;
    return(0);
}
| |
| #ifdef LIBXML_OUTPUT_ENABLED |
/**
 * UTF8ToUTF16BE:
 * @outb: a pointer to an array of bytes to store the result
 * @outlen: in: the size of @outb in bytes; out: the number of bytes
 *          written
 * @in: a pointer to an array of UTF-8 chars, or NULL for initialization
 * @inlen: in: the length of @in; out: the number of bytes consumed
 *
 * Takes a block of UTF-8 chars and tries to convert it to a UTF-16BE
 * block of code units.  The output is stored byte by byte, so @outb
 * needs no particular alignment and the result does not depend on the
 * byte order of the host.  No byte order mark is produced.
 *
 * Conversion stops quietly when @outb cannot hold the next character or
 * when @in ends in the middle of a character.
 *
 * Returns 0 on success, or -2 if the transcoding failed: invalid lead
 *         byte, malformed continuation byte, or a value above 0x10FFFF.
 *         In every case, error paths included, @outlen is a count of
 *         bytes and, with @inlen, describes the complete characters
 *         converted so far.
 */
static int
UTF8ToUTF16BE(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen)
{
    unsigned char *out = outb;
    unsigned char *outend;
    const unsigned char *processed = in;
    const unsigned char *instart = in;
    const unsigned char *inend;
    unsigned int c, d, hi, lo;
    int trailing;
    int ret = 0;

    /* UTF-16BE has no BOM */
    if (in == NULL) {
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    /* only whole 16-bit units are ever written */
    outend = outb + (*outlen / 2) * 2;

    while (in < inend) {
        d = *in++;
        if (d < 0x80) {
            c = d;
            trailing = 0;
        } else if (d < 0xC0) {
            /* trailing byte in leading position */
            ret = -2;
            break;
        } else if (d < 0xE0) {
            c = d & 0x1F;
            trailing = 1;
        } else if (d < 0xF0) {
            c = d & 0x0F;
            trailing = 2;
        } else if (d < 0xF8) {
            c = d & 0x07;
            trailing = 3;
        } else {
            /* no chance for this in UTF-16 */
            ret = -2;
            break;
        }

        /* character split across buffers: wait for the rest */
        if (inend - in < trailing)
            break;

        for ( ; trailing > 0; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80) {
                /* malformed sequence: fail instead of emitting garbage */
                ret = -2;
                goto done;
            }
            c = (c << 6) | (d & 0x3F);
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            *out++ = (unsigned char) (c >> 8);
            *out++ = (unsigned char) (c & 0xFF);
        } else if (c < 0x110000) {
            /* needs a surrogate pair */
            if (outend - out < 4)
                break;
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            *out++ = (unsigned char) (hi >> 8);
            *out++ = (unsigned char) (hi & 0xFF);
            *out++ = (unsigned char) (lo >> 8);
            *out++ = (unsigned char) (lo & 0xFF);
        } else {
            /* beyond the range UTF-16 can represent */
            ret = -2;
            break;
        }
        processed = in;
    }

done:
    /* byte count on every path */
    *outlen = out - outb;
    *inlen = processed - instart;
    return(ret);
}
| #endif /* LIBXML_OUTPUT_ENABLED */ |
| |
| /************************************************************************ |
| * * |
| * Generic encoding handling routines * |
| * * |
| ************************************************************************/ |
| |
| /** |
| * xmlDetectCharEncoding: |
| * @in: a pointer to the first bytes of the XML entity, must be at least |
| * 2 bytes long (at least 4 if encoding is UTF4 variant). |
| * @len: pointer to the length of the buffer |
| * |
| * Guess the encoding of the entity using the first bytes of the entity content |
| * according to the non-normative appendix F of the XML-1.0 recommendation. |
| * |
| * Returns one of the XML_CHAR_ENCODING_... values. |
| */ |
| xmlCharEncoding |
| xmlDetectCharEncoding(const unsigned char* in, int len) |
| { |
| if (len >= 4) { |
| if ((in[0] == 0x00) && (in[1] == 0x00) && |
| (in[2] == 0x00) && (in[3] == 0x3C)) |
| return(XML_CHAR_ENCODING_UCS4BE); |
| if ((in[0] == 0x3C) && (in[1] == 0x00) && |
| (in[2] == 0x00) && (in[3] == 0x00)) |
| return(XML_CHAR_ENCODING_UCS4LE); |
| if ((in[0] == 0x00) && (in[1] == 0x00) && |
| (in[2] == 0x3C) && (in[3] == 0x00)) |
| return(XML_CHAR_ENCODING_UCS4_2143); |
| if ((in[0] == 0x00) && (in[1] == 0x3C) && |
| (in[2] == 0x00) && (in[3] == 0x00)) |
| return(XML_CHAR_ENCODING_UCS4_3412); |
| if ((in[0] == 0x4C) && (in[1] == 0x6F) && |
| (in[2] == 0xA7) && (in[3] == 0x94)) |
| return(XML_CHAR_ENCODING_EBCDIC); |
| if ((in[0] == 0x3C) && (in[1] == 0x3F) && |
| (in[2] == 0x78) && (in[3] == 0x6D)) |
| return(XML_CHAR_ENCODING_UTF8); |
| /* |
| * Although not part of the recommendation, we also |
| * attempt an "auto-recognition" of UTF-16LE and |
| * UTF-16BE encodings. |
| */ |
| if ((in[0] == 0x3C) && (in[1] == 0x00) && |
| (in[2] == 0x3F) && (in[3] == 0x00)) |
| return(XML_CHAR_ENCODING_UTF16LE); |
| if ((in[0] == 0x00) && (in[1] == 0x3C) && |
| (in[2] == 0x00) && (in[3] == 0x3F)) |
| return(XML_CHAR_ENCODING_UTF16BE); |
| } |
| if (len >= 3) { |
| /* |
| * Errata on XML-1.0 June 20 2001 |
| * We now allow an UTF8 encoded BOM |
| */ |
| if ((in[0] == 0xEF) && (in[1] == 0xBB) && |
| (in[2] == 0xBF)) |
| return(XML_CHAR_ENCODING_UTF8); |
| } |
| /* For UTF-16 we can recognize by the BOM */ |
| if (len >= 2) { |
| if ((in[0] == 0xFE) && (in[1] == 0xFF)) |
| return(XML_CHAR_ENCODING_UTF16BE); |
| if ((in[0] == 0xFF) && (in[1] == 0xFE)) |
| return(XML_CHAR_ENCODING_UTF16LE); |
| } |
| return(XML_CHAR_ENCODING_NONE); |
| } |
| |
| /** |
| * xmlCleanupEncodingAliases: |
| * |
| * Unregisters all aliases |
| */ |
| void |
| xmlCleanupEncodingAliases(void) { |
| int i; |
| |
| if (xmlCharEncodingAliases == NULL) |
| return; |
| |
| for (i = 0;i < xmlCharEncodingAliasesNb;i++) { |
| if (xmlCharEncodingAliases[i].name != NULL) |
| xmlFree((char *) xmlCharEncodingAliases[i].name); |
| if (xmlCharEncodingAliases[i].alias != NULL) |
| xmlFree((char *) xmlCharEncodingAliases[i].alias); |
| } |
| xmlCharEncodingAliasesNb = 0; |
| xmlCharEncodingAliasesMax = 0; |
| xmlFree(xmlCharEncodingAliases); |
| xmlCharEncodingAliases = NULL; |
| } |
| |
| /** |
| * xmlGetEncodingAlias: |
| * @alias: the alias name as parsed, in UTF-8 format (ASCII actually) |
| * |
| * Lookup an encoding name for the given alias. |
| * |
| * Returns NULL if not found, otherwise the original name |
| */ |
| const char * |
| xmlGetEncodingAlias(const char *alias) { |
| int i; |
| char upper[100]; |
| |
| if (alias == NULL) |
| return(NULL); |
| |
| if (xmlCharEncodingAliases == NULL) |
| return(NULL); |
| |
| for (i = 0;i < 99;i++) { |
| upper[i] = toupper(alias[i]); |
| if (upper[i] == 0) break; |
| } |
| upper[i] = 0; |
| |
| /* |
| * Walk down the list looking for a definition of the alias |
| */ |
| for (i = 0;i < xmlCharEncodingAliasesNb;i++) { |
| if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) { |
| return(xmlCharEncodingAliases[i].name); |
| } |
| } |
| return(NULL); |
| } |
| |
| /** |
| * xmlAddEncodingAlias: |
| * @name: the encoding name as parsed, in UTF-8 format (ASCII actually) |
| * @alias: the alias name as parsed, in UTF-8 format (ASCII actually) |
| * |
| * Registers an alias @alias for an encoding named @name. Existing alias |
| * will be overwritten. |
| * |
| * Returns 0 in case of success, -1 in case of error |
| */ |
| int |
| xmlAddEncodingAlias(const char *name, const char *alias) { |
| int i; |
| char upper[100]; |
| |
| if ((name == NULL) || (alias == NULL)) |
| return(-1); |
| |
| for (i = 0;i < 99;i++) { |
| upper[i] = toupper(alias[i]); |
| if (upper[i] == 0) break; |
| } |
| upper[i] = 0; |
| |
| if (xmlCharEncodingAliases == NULL) { |
| xmlCharEncodingAliasesNb = 0; |
| xmlCharEncodingAliasesMax = 20; |
| xmlCharEncodingAliases = (xmlCharEncodingAliasPtr) |
| xmlMalloc(xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias)); |
| if (xmlCharEncodingAliases == NULL) |
| return(-1); |
| } else if (xmlCharEncodingAliasesNb >= xmlCharEncodingAliasesMax) { |
| xmlCharEncodingAliasesMax *= 2; |
| xmlCharEncodingAliases = (xmlCharEncodingAliasPtr) |
| xmlRealloc(xmlCharEncodingAliases, |
| xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias)); |
| } |
| /* |
| * Walk down the list looking for a definition of the alias |
| */ |
| for (i = 0;i < xmlCharEncodingAliasesNb;i++) { |
| if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) { |
| /* |
| * Replace the definition. |
| */ |
| xmlFree((char *) xmlCharEncodingAliases[i].name); |
| xmlCharEncodingAliases[i].name = xmlMemStrdup(name); |
| return(0); |
| } |
| } |
| /* |
| * Add the definition |
| */ |
| xmlCharEncodingAliases[xmlCharEncodingAliasesNb].name = xmlMemStrdup(name); |
| xmlCharEncodingAliases[xmlCharEncodingAliasesNb].alias = xmlMemStrdup(upper); |
| xmlCharEncodingAliasesNb++; |
| return(0); |
| } |
| |
| /** |
| * xmlDelEncodingAlias: |
| * @alias: the alias name as parsed, in UTF-8 format (ASCII actually) |
| * |
| * Unregisters an encoding alias @alias |
| * |
| * Returns 0 in case of success, -1 in case of error |
| */ |
| int |
| xmlDelEncodingAlias(const char *alias) { |
| int i; |
| |
| if (alias == NULL) |
| return(-1); |
| |
| if (xmlCharEncodingAliases == NULL) |
| return(-1); |
| /* |
| * Walk down the list looking for a definition of the alias |
| */ |
| for (i = 0;i < xmlCharEncodingAliasesNb;i++) { |
| if (!strcmp(xmlCharEncodingAliases[i].alias, alias)) { |
| xmlFree((char *) xmlCharEncodingAliases[i].name); |
| xmlFree((char *) xmlCharEncodingAliases[i].alias); |
| xmlCharEncodingAliasesNb--; |
| memmove(&xmlCharEncodingAliases[i], &xmlCharEncodingAliases[i + 1], |
| sizeof(xmlCharEncodingAlias) * (xmlCharEncodingAliasesNb - i)); |
| return(0); |
| } |
| } |
| return(-1); |
| } |
| |
| /** |
| * xmlParseCharEncoding: |
| * @name: the encoding name as parsed, in UTF-8 format (ASCII actually) |
| * |
| * Compare the string to the encoding schemes already known. Note |
| * that the comparison is case insensitive accordingly to the section |
| * [XML] 4.3.3 Character Encoding in Entities. |
| * |
| * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE |
| * if not recognized. |
| */ |
| xmlCharEncoding |
| xmlParseCharEncoding(const char* name) |
| { |
| const char *alias; |
| char upper[500]; |
| int i; |
| |
| if (name == NULL) |
| return(XML_CHAR_ENCODING_NONE); |
| |
| /* |
| * Do the alias resolution |
| */ |
| alias = xmlGetEncodingAlias(name); |
| if (alias != NULL) |
| name = alias; |
| |
| for (i = 0;i < 499;i++) { |
| upper[i] = toupper(name[i]); |
| if (upper[i] == 0) break; |
| } |
| upper[i] = 0; |
| |
| if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE); |
| if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8); |
| if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8); |
| |
| /* |
| * NOTE: if we were able to parse this, the endianness of UTF16 is |
| * already found and in use |
| */ |
| if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE); |
| if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE); |
| |
| if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2); |
| if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2); |
| if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2); |
| |
| /* |
| * NOTE: if we were able to parse this, the endianness of UCS4 is |
| * already found and in use |
| */ |
| if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE); |
| if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE); |
| if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE); |
| |
| |
| if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1); |
| if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1); |
| if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1); |
| |
| if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2); |
| if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2); |
| if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2); |
| |
| if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3); |
| if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4); |
| if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5); |
| if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6); |
| if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7); |
| if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8); |
| if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9); |
| |
| if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP); |
| if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS); |
| if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP); |
| |
| #ifdef DEBUG_ENCODING |
| xmlGenericError(xmlGenericErrorContext, "Unknown encoding %s\n", name); |
| #endif |
| return(XML_CHAR_ENCODING_ERROR); |
| } |
| |
| /** |
| * xmlGetCharEncodingName: |
| * @enc: the encoding |
| * |
| * The "canonical" name for XML encoding. |
| * C.f. http://www.w3.org/TR/REC-xml#charencoding |
| * Section 4.3.3 Character Encoding in Entities |
| * |
| * Returns the canonical name for the given encoding |
| */ |
| |
| const char* |
| xmlGetCharEncodingName(xmlCharEncoding enc) { |
| switch (enc) { |
| case XML_CHAR_ENCODING_ERROR: |
| return(NULL); |
| case XML_CHAR_ENCODING_NONE: |
| return(NULL); |
| case XML_CHAR_ENCODING_UTF8: |
| return("UTF-8"); |
| case XML_CHAR_ENCODING_UTF16LE: |
| return("UTF-16"); |
| case XML_CHAR_ENCODING_UTF16BE: |
| return("UTF-16"); |
| case XML_CHAR_ENCODING_EBCDIC: |
| return("EBCDIC"); |
| case XML_CHAR_ENCODING_UCS4LE: |
| return("ISO-10646-UCS-4"); |
| case XML_CHAR_ENCODING_UCS4BE: |
| return("ISO-10646-UCS-4"); |
| case XML_CHAR_ENCODING_UCS4_2143: |
| return("ISO-10646-UCS-4"); |
| case XML_CHAR_ENCODING_UCS4_3412: |
| return("ISO-10646-UCS-4"); |
| case XML_CHAR_ENCODING_UCS2: |
| return("ISO-10646-UCS-2"); |
| case XML_CHAR_ENCODING_8859_1: |
| return("ISO-8859-1"); |
| case XML_CHAR_ENCODING_8859_2: |
| return("ISO-8859-2"); |
| case XML_CHAR_ENCODING_8859_3: |
| return("ISO-8859-3"); |
| case XML_CHAR_ENCODING_8859_4: |
| return("ISO-8859-4"); |
| case XML_CHAR_ENCODING_8859_5: |
| return("ISO-8859-5"); |
| case XML_CHAR_ENCODING_8859_6: |
| return("ISO-8859-6"); |
| case XML_CHAR_ENCODING_8859_7: |
| return("ISO-8859-7"); |
| case XML_CHAR_ENCODING_8859_8: |
| return("ISO-8859-8"); |
| case XML_CHAR_ENCODING_8859_9: |
| return("ISO-8859-9"); |
| case XML_CHAR_ENCODING_2022_JP: |
| return("ISO-2022-JP"); |
| case XML_CHAR_ENCODING_SHIFT_JIS: |
| return("Shift-JIS"); |
| case XML_CHAR_ENCODING_EUC_JP: |
| return("EUC-JP"); |
| case XML_CHAR_ENCODING_ASCII: |
| return(NULL); |
| } |
| return(NULL); |
| } |
| |
| /************************************************************************ |
| * * |
| * Char encoding handlers * |
| * * |
| ************************************************************************/ |
| |
| |
| /* Fixed capacity of the handlers table: it is allocated once with this many slots and registration is refused once they are all used. The size should be growable, but it's not a big deal ... */ |
| #define MAX_ENCODING_HANDLERS 50 |
| static xmlCharEncodingHandlerPtr *handlers = NULL; /* table of registered handlers, NULL until xmlInitCharEncodingHandlers() allocates it */ |
| static int nbCharEncodingHandler = 0; /* number of slots of handlers currently in use */ |
| |
| /* |
| * The default is UTF-8 for XML, that's also the default used for the |
| * parser internals, so the default encoding handler is NULL. |
| * It is what xmlFindCharEncodingHandler() returns for a NULL or empty name. |
| */ |
| |
| static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL; |
| |
| /** |
| * xmlNewCharEncodingHandler: |
| * @name: the encoding name, in UTF-8 format (ASCII actually) |
| * @input: the xmlCharEncodingInputFunc to read that encoding |
| * @output: the xmlCharEncodingOutputFunc to write that encoding |
| * |
| * Create and registers an xmlCharEncodingHandler. |
| * |
| * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error). |
| */ |
| xmlCharEncodingHandlerPtr |
| xmlNewCharEncodingHandler(const char *name, |
| xmlCharEncodingInputFunc input, |
| xmlCharEncodingOutputFunc output) { |
| xmlCharEncodingHandlerPtr handler; |
| const char *alias; |
| char upper[500]; |
| int i; |
| char *up = 0; |
| |
| /* |
| * Do the alias resolution |
| */ |
| alias = xmlGetEncodingAlias(name); |
| if (alias != NULL) |
| name = alias; |
| |
| /* |
| * Keep only the uppercase version of the encoding. |
| */ |
| if (name == NULL) { |
| xmlGenericError(xmlGenericErrorContext, |
| "xmlNewCharEncodingHandler : no name !\n"); |
| return(NULL); |
| } |
| for (i = 0;i < 499;i++) { |
| upper[i] = toupper(name[i]); |
| if (upper[i] == 0) break; |
| } |
| upper[i] = 0; |
| up = xmlMemStrdup(upper); |
| if (up == NULL) { |
| xmlGenericError(xmlGenericErrorContext, |
| "xmlNewCharEncodingHandler : out of memory !\n"); |
| return(NULL); |
| } |
| |
| /* |
| * allocate and fill-up an handler block. |
| */ |
| handler = (xmlCharEncodingHandlerPtr) |
| xmlMalloc(sizeof(xmlCharEncodingHandler)); |
| if (handler == NULL) { |
| xmlGenericError(xmlGenericErrorContext, |
| "xmlNewCharEncodingHandler : out of memory !\n"); |
| return(NULL); |
| } |
| handler->input = input; |
| handler->output = output; |
| handler->name = up; |
| |
| #ifdef LIBXML_ICONV_ENABLED |
| handler->iconv_in = NULL; |
| handler->iconv_out = NULL; |
| #endif /* LIBXML_ICONV_ENABLED */ |
| |
| /* |
| * registers and returns the handler. |
| */ |
| xmlRegisterCharEncodingHandler(handler); |
| #ifdef DEBUG_ENCODING |
| xmlGenericError(xmlGenericErrorContext, |
| "Registered encoding handler for %s\n", name); |
| #endif |
| return(handler); |
| } |
| |
| /** |
| * xmlInitCharEncodingHandlers: |
| * |
| * Initialize the char encoding support, it registers the default |
| * encoding supported. |
| * NOTE: while public, this function usually doesn't need to be called |
| * in normal processing. |
| */ |
| void |
| xmlInitCharEncodingHandlers(void) { |
| unsigned short int tst = 0x1234; |
| unsigned char *ptr = (unsigned char *) &tst; |
| |
| if (handlers != NULL) return; |
| |
| handlers = (xmlCharEncodingHandlerPtr *) |
| xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr)); |
| |
| if (*ptr == 0x12) xmlLittleEndian = 0; |
| else if (*ptr == 0x34) xmlLittleEndian = 1; |
| else xmlGenericError(xmlGenericErrorContext, |
| "Odd problem at endianness detection\n"); |
| |
| if (handlers == NULL) { |
| xmlGenericError(xmlGenericErrorContext, |
| "xmlInitCharEncodingHandlers : out of memory !\n"); |
| return; |
| } |
| xmlNewCharEncodingHandler("UTF-8", UTF8ToUTF8, UTF8ToUTF8); |
| #ifdef LIBXML_OUTPUT_ENABLED |
| xmlUTF16LEHandler = |
| xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE); |
| xmlUTF16BEHandler = |
| xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE); |
| xmlNewCharEncodingHandler("UTF-16", UTF16LEToUTF8, UTF8ToUTF16); |
| xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1); |
| xmlNewCharEncodingHandler("ASCII", asciiToUTF8, UTF8Toascii); |
| xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, UTF8Toascii); |
| #ifdef LIBXML_HTML_ENABLED |
| xmlNewCharEncodingHandler("HTML", NULL, UTF8ToHtml); |
| #endif |
| #else |
| xmlUTF16LEHandler = |
| xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, NULL); |
| xmlUTF16BEHandler = |
| xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, NULL); |
| xmlNewCharEncodingHandler("UTF-16", UTF16LEToUTF8, NULL); |
| xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, NULL); |
| xmlNewCharEncodingHandler("ASCII", asciiToUTF8, NULL); |
| xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, NULL); |
| #endif /* LIBXML_OUTPUT_ENABLED */ |
| #ifndef LIBXML_ICONV_ENABLED |
| #ifdef LIBXML_ISO8859X_ENABLED |
| xmlRegisterCharEncodingHandlersISO8859x (); |
| #endif |
| #endif |
| |
| } |
| |
| /** |
| * xmlCleanupCharEncodingHandlers: |
| * |
| * Cleanup the memory allocated for the char encoding support, it |
| * unregisters all the encoding handlers and the aliases. |
| */ |
| void |
| xmlCleanupCharEncodingHandlers(void) { |
| xmlCleanupEncodingAliases(); |
| |
| if (handlers == NULL) return; |
| |
| for (;nbCharEncodingHandler > 0;) { |
| nbCharEncodingHandler--; |
| if (handlers[nbCharEncodingHandler] != NULL) { |
| if (handlers[nbCharEncodingHandler]->name != NULL) |
| xmlFree(handlers[nbCharEncodingHandler]->name); |
| xmlFree(handlers[nbCharEncodingHandler]); |
| } |
| } |
| xmlFree(handlers); |
| handlers = NULL; |
| nbCharEncodingHandler = 0; |
| xmlDefaultCharEncodingHandler = NULL; |
| } |
| |
| /** |
| * xmlRegisterCharEncodingHandler: |
| * @handler: the xmlCharEncodingHandlerPtr handler block |
| * |
| * Register the char encoding handler, surprising, isn't it ? |
| */ |
| void |
| xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) { |
| if (handlers == NULL) xmlInitCharEncodingHandlers(); |
| if (handler == NULL) { |
| xmlGenericError(xmlGenericErrorContext, |
| "xmlRegisterCharEncodingHandler: NULL handler !\n"); |
| return; |
| } |
| |
| if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) { |
| xmlGenericError(xmlGenericErrorContext, |
| "xmlRegisterCharEncodingHandler: Too many handler registered\n"); |
| xmlGenericError(xmlGenericErrorContext, |
| "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__); |
| return; |
| } |
| handlers[nbCharEncodingHandler++] = handler; |
| } |
| |
| /** |
| * xmlGetCharEncodingHandler: |
| * @enc: an xmlCharEncoding value. |
| * |
| * Search in the registered set the handler able to read/write that encoding. |
| * |
| * Returns the handler or NULL if not found |
| */ |
| xmlCharEncodingHandlerPtr |
| xmlGetCharEncodingHandler(xmlCharEncoding enc) { |
| xmlCharEncodingHandlerPtr handler; |
| |
| if (handlers == NULL) xmlInitCharEncodingHandlers(); |
| switch (enc) { |
| case XML_CHAR_ENCODING_ERROR: |
| return(NULL); |
| case XML_CHAR_ENCODING_NONE: |
| return(NULL); |
| case XML_CHAR_ENCODING_UTF8: |
| return(NULL); |
| case XML_CHAR_ENCODING_UTF16LE: |
| return(xmlUTF16LEHandler); |
| case XML_CHAR_ENCODING_UTF16BE: |
| return(xmlUTF16BEHandler); |
| case XML_CHAR_ENCODING_EBCDIC: |
| handler = xmlFindCharEncodingHandler("EBCDIC"); |
| if (handler != NULL) return(handler); |
| handler = xmlFindCharEncodingHandler("ebcdic"); |
| if (handler != NULL) return(handler); |
| break; |
| case XML_CHAR_ENCODING_UCS4BE: |
| handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4"); |
| if (handler != NULL) return(handler); |
| handler = xmlFindCharEncodingHandler("UCS-4"); |
| if (handler != NULL) return(handler); |
| handler = xmlFindCharEncodingHandler("UCS4"); |
| if (handler != NULL) return(handler); |
| break; |
| case XML_CHAR_ENCODING_UCS4LE: |
| handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4"); |
| if (handler != NULL) return(handler); |
| handler = xmlFindCharEncodingHandler("UCS-4"); |
| if (handler != NULL) return(handler); |
| handler = xmlFindCharEncodingHandler("UCS4"); |
| if (handler != NULL) return(handler); |
| break; |
| case XML_CHAR_ENCODING_UCS4_2143: |
| break; |
| case XML_CHAR_ENCODING_UCS4_3412: |
| break; |
| case XML_CHAR_ENCODING_UCS2: |
| handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2"); |
| if (handler != NULL) return(handler); |
| handler = xmlFindCharEncodingHandler("UCS-2"); |
| if (handler != NULL) return(handler); |
| handler = xmlFindCharEncodingHandler("UCS2"); |
| if (handler != NULL) return(handler); |
| break; |
| |
| /* |
| * We used to keep ISO Latin encodings native in the |
| * generated data. This led to so many problems that |
| * this has been removed. One can still change this |
| * back by registering no-ops encoders for those |
| */ |
| case XML_CHAR_ENCODING_8859_1: |
| handler = xmlFindCharEncodingHandler("ISO-8859-1"); |
| if (handler != NULL) return(handler); |
| break; |
| case XML_CHAR_ENCODING_8859_2: |
| handler = xmlFindCharEncodingHandler("ISO-8859-2"); |
| if (handler != NULL) return(handler); |
| break; |
| case XML_CHAR_ENCODING_8859_3: |
| handler = xmlFindCharEncodingHandler("ISO-8859-3"); |
| if (handler != NULL) return(handler); |
| break; |
| case XML_CHAR_ENCODING_8859_4: |
| handler = xmlFindCharEncodingHandler("ISO-8859-4"); |
| if (handler != NULL) return(handler); |
| break; |
| case XML_CHAR_ENCODING_8859_5: |
| handler = xmlFindCharEncodingHandler("ISO-8859-5"); |
| if (handler != NULL) return(handler); |
| break; |
| case XML_CHAR_ENCODING_8859_6: |
| handler = xmlFindCharEncodingHandler("ISO-8859-6"); |
| if (handler != NULL) return(handler); |
| break; |
| case XML_CHAR_ENCODING_8859_7: |
| handler = xmlFindCharEncodingHandler("ISO-8859-7"); |
| if (handler != NULL) return(handler); |
| break; |
| case XML_CHAR_ENCODING_8859_8: |
| handler = xmlFindCharEncodingHandler("ISO-8859-8"); |
| if (handler != NULL) return(handler); |
| break; |
| case XML_CHAR_ENCODING_8859_9: |
| handler = xmlFindCharEncodingHandler("ISO-8859-9"); |
| if (handler != NULL) return(handler); |
| break; |
| |
| |
| case XML_CHAR_ENCODING_2022_JP: |
| handler = xmlFindCharEncodingHandler("ISO-2022-JP"); |
| if (handler != NULL) return(handler); |
| break; |
| case XML_CHAR_ENCODING_SHIFT_JIS: |
| handler = xmlFindCharEncodingHandler("SHIFT-JIS"); |
| if (handler != NULL) return(handler); |
| handler = xmlFindCharEncodingHandler("SHIFT_JIS"); |
| if (handler != NULL) return(handler); |
| handler = xmlFindCharEncodingHandler("Shift_JIS"); |
| if (handler != NULL) return(handler); |
| break; |
| case XML_CHAR_ENCODING_EUC_JP: |
| handler = xmlFindCharEncodingHandler("EUC-JP"); |
| if (handler != NULL) return(handler); |
| break; |
| default: |
| break; |
| } |
| |
| #ifdef DEBUG_ENCODING |
| xmlGenericError(xmlGenericErrorContext, |
| "No handler found for encoding %d\n", enc); |
| #endif |
| return(NULL); |
| } |
| |
| /** |
| * xmlFindCharEncodingHandler: |
| * @name: a string describing the char encoding. |
| * |
| * Search in the registered set the handler able to read/write that encoding. |
| * |
| * Returns the handler or NULL if not found |
| */ |
| xmlCharEncodingHandlerPtr |
| xmlFindCharEncodingHandler(const char *name) { |
| const char *nalias; |
| const char *norig; |
| xmlCharEncoding alias; |
| #ifdef LIBXML_ICONV_ENABLED |
| xmlCharEncodingHandlerPtr enc; |
| iconv_t icv_in, icv_out; |
| #endif /* LIBXML_ICONV_ENABLED */ |
| char upper[100]; |
| int i; |
| |
| if (handlers == NULL) xmlInitCharEncodingHandlers(); |
| if (name == NULL) return(xmlDefaultCharEncodingHandler); |
| if (name[0] == 0) return(xmlDefaultCharEncodingHandler); |
| |
| /* |
| * Do the alias resolution |
| */ |
| norig = name; |
| nalias = xmlGetEncodingAlias(name); |
| if (nalias != NULL) |
| name = nalias; |
| |
| /* |
| * Check first for directly registered encoding names |
| */ |
| for (i = 0;i < 99;i++) { |
| upper[i] = toupper(name[i]); |
| if (upper[i] == 0) break; |
| } |
| upper[i] = 0; |
| |
| for (i = 0;i < nbCharEncodingHandler; i++) |
| if (!strcmp(upper, handlers[i]->name)) { |
| #ifdef DEBUG_ENCODING |
| xmlGenericError(xmlGenericErrorContext, |
| "Found registered handler for encoding %s\n", name); |
| #endif |
| return(handlers[i]); |
| } |
| |
| #ifdef LIBXML_ICONV_ENABLED |
| /* check whether iconv can handle this */ |
| icv_in = iconv_open("UTF-8", name); |
| icv_out = iconv_open(name, "UTF-8"); |
| if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) { |
| enc = (xmlCharEncodingHandlerPtr) |
| xmlMalloc(sizeof(xmlCharEncodingHandler)); |
| if (enc == NULL) { |
| iconv_close(icv_in); |
| iconv_close(icv_out); |
| return(NULL); |
| } |
| enc->name = xmlMemStrdup(name); |
| enc->input = NULL; |
| enc->output = NULL; |
| enc->iconv_in = icv_in; |
| enc->iconv_out = icv_out; |
| #ifdef DEBUG_ENCODING |
| xmlGenericError(xmlGenericErrorContext, |
| "Found iconv handler for encoding %s\n", name); |
| #endif |
| return enc; |
| } else if ((icv_in != (iconv_t) -1) || icv_out != (iconv_t) -1) { |
| xmlGenericError(xmlGenericErrorContext, |
| "iconv : problems with filters for '%s'\n", name); |
| } |
| #endif /* LIBXML_ICONV_ENABLED */ |
| |
| #ifdef DEBUG_ENCODING |
| xmlGenericError(xmlGenericErrorContext, |
| "No handler found for encoding %s\n", name); |
| #endif |
| |
| /* |
| * Fallback using the canonical names |
| */ |
| alias = xmlParseCharEncoding(norig); |
| if (alias != XML_CHAR_ENCODING_ERROR) { |
| const char* canon; |
| canon = xmlGetCharEncodingName(alias); |
| if ((canon != NULL) && (strcmp(name, canon))) { |
| return(xmlFindCharEncodingHandler(canon)); |
| } |
| } |
| |
| /* If "none of the above", give up */ |
| return(NULL); |
| } |
| |
| /************************************************************************ |
| * * |
| * ICONV based generic conversion functions * |
| * * |
| ************************************************************************/ |
| |
| #ifdef LIBXML_ICONV_ENABLED |
/**
 * xmlIconvWrapper:
 * @cd:  iconv converter data structure
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of bytes to convert (may be NULL)
 * @inlen:  the length of @in
 *
 * Run one iconv() conversion and map its outcome to the return codes
 * used by the encoding handlers.
 *
 * Returns 0 if success, or
 *     -1 by lack of space, or
 *     -2 if the transcoding fails (the input is not valid for the source
 *        encoding or cannot be represented in the target encoding), or
 *     -3 if the input ends with an incomplete sequence, or for any
 *        other failure.
 *
 * The value of @inlen after return is the number of octets consumed
 * and the value of @outlen is the number of octets produced. Both are
 * set to 0 when @in is NULL.
 */
static int
xmlIconvWrapper(iconv_t cd,
                unsigned char *out, int *outlen,
                const unsigned char *in, int *inlen) {

    size_t srcLeft = *inlen;
    size_t dstLeft = *outlen;
    const char *src = (const char *) in;
    char *dst = (char *) out;
    int status;

    status = iconv(cd, (char **) &src, &srcLeft, &dst, &dstLeft);

    /* turn the remaining counts into consumed/produced counts */
    if (in == NULL) {
        *inlen = 0;
        *outlen = 0;
    } else {
        *inlen -= srcLeft;
        *outlen -= dstLeft;
    }

    /* everything was converted */
    if ((srcLeft == 0) && (status != -1))
        return 0;

#ifdef EILSEQ
    if (errno == EILSEQ)
        return -2;
#endif
#ifdef E2BIG
    if (errno == E2BIG)
        return -1;
#endif
    /* EINVAL (truncated input) and anything else */
    return -3;
}
| #endif /* LIBXML_ICONV_ENABLED */ |
| |
| /************************************************************************ |
| * * |
| * The real API used by libxml for on-the-fly conversion * |
| * * |
| ************************************************************************/ |
| |
| /** |
| * xmlCharEncFirstLine: |
| * @handler: char enconding transformation data structure |
| * @out: an xmlBuffer for the output. |
| * @in: an xmlBuffer for the input |
| * |
| * Front-end for the encoding handler input function, but handle only |
| * the very first line, i.e. limit itself to 45 chars. |
| * |
| * Returns the number of byte written if success, or |
| * -1 general error |
| * -2 if the transcoding fails (for *in is not valid utf8 string or |
| * the result of transformation can't fit into the encoding we want), or |
| */ |
| int |
| xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out, |
| xmlBufferPtr in) { |
| int ret = -2; |
| int written; |
| int toconv; |
| |
| if (handler == NULL) return(-1); |
| if (out == NULL) return(-1); |
| if (in == NULL) return(-1); |
| |
| written = out->size - out->use; |
| toconv = in->use; |
| if (toconv * 2 >= written) { |
| xmlBufferGrow(out, toconv); |
| written = out->size - out->use - 1; |
| } |
| |
| /* |
| * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38 |
| * 45 chars should be sufficient to reach the end of the encoding |
| * declaration without going too far inside the document content. |
| */ |
| written = 45; |
| |
| if (handler->input != NULL) { |
| ret = handler->input(&out->content[out->use], &written, |
| in->content, &toconv); |
| xmlBufferShrink(in, toconv); |
| out->use += written; |
| out->content[out->use] = 0; |
| } |
| #ifdef LIBXML_ICONV_ENABLED |
| else if (handler->iconv_in != NULL) { |
| ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use], |
| &written, in->content, &toconv); |
| xmlBufferShrink(in, toconv); |
| out->use += written; |
| out->content[out->use] = 0; |
| if (ret == -1) ret = -3; |
| } |
| #endif /* LIBXML_ICONV_ENABLED */ |
| #ifdef DEBUG_ENCODING |
| switch (ret) { |
| case 0: |
| xmlGenericError(xmlGenericErrorContext, |
| "converted %d bytes to %d bytes of input\n", |
| toconv, written); |
| break; |
| case -1: |
| xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n", |
| toconv, written, in->use); |
| break; |
| case -2: |
| xmlGenericError(xmlGenericErrorContext, |
| "input conversion failed due to input error\n"); |
| break; |
| case -3: |
| xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n", |
| toconv, written, in->use); |
| break; |
| default: |
| xmlGenericError(xmlGenericErrorContext,"Unknown input conversion failed %d\n", ret); |
| } |
| #endif /* DEBUG_ENCODING */ |
| /* |
| * Ignore when input buffer is not on a boundary |
| */ |
| if (ret == -3) ret = 0; |
| if (ret == -1) ret = 0; |
| return(ret); |
| } |
| |
| /** |
| * xmlCharEncInFunc: |
| * @handler: char encoding transformation data structure |
| * @out: an xmlBuffer for the output. |
| * @in: an xmlBuffer for the input |
| * |
| * Generic front-end for the encoding handler input function |
| * |
| * Returns the number of byte written if success, or |
| * -1 general error |
| * -2 if the transcoding fails (for *in is not valid utf8 string or |
| * the result of transformation can't fit into the encoding we want), or |
| */ |
| int |
| xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out, |
| xmlBufferPtr in) |
| { |
| int ret = -2; |
| int written; |
| int toconv; |
| |
| if (handler == NULL) |
| return (-1); |
| if (out == NULL) |
| return (-1); |
| if (in == NULL) |
| return (-1); |
| |
| toconv = in->use; |
| if (toconv == 0) |
| return (0); |
| written = out->size - out->use; |
| if (toconv * 2 >= written) { |
| xmlBufferGrow(out, out->size + toconv * 2); |
| written = out->size - out->use - 1; |
| } |
| if (handler->input != NULL) { |
| ret = handler->input(&out->content[out->use], &written, |
| in->content, &toconv); |
| xmlBufferShrink(in, toconv); |
| out->use += written; |
| out->content[out->use] = 0; |
| } |
| #ifdef LIBXML_ICONV_ENABLED |
| else if (handler->iconv_in != NULL) { |
| ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use], |
| &written, in->content, &toconv); |
| xmlBufferShrink(in, toconv); |
| out->use += written; |
| out->content[out->use] = 0; |
| if (ret == -1) |
| ret = -3; |
| } |
| #endif /* LIBXML_ICONV_ENABLED */ |
| switch (ret) { |
| case 0: |
| #ifdef DEBUG_ENCODING |
| xmlGenericError(xmlGenericErrorContext, |
| "converted %d bytes to %d bytes of input\n", |
| toconv, written); |
| #endif |
| break; |
| case -1: |
| #ifdef DEBUG_ENCODING |
| xmlGenericError(xmlGenericErrorContext, |
| "converted %d bytes to %d bytes of input, %d left\n", |
| toconv, written, in->use); |
| #endif |
| break; |
| case -3: |
| #ifdef DEBUG_ENCODING |
| xmlGenericError(xmlGenericErrorContext, |
| "converted %d bytes to %d bytes of input, %d left\n", |
| toconv, written, in->use); |
| #endif |
| break; |
| case -2: |
| xmlGenericError(xmlGenericErrorContext, |
| "input conversion failed due to input error\n"); |
| xmlGenericError(xmlGenericErrorContext, |
| "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", |
| in->content[0], in->content[1], |
| in->content[2], in->content[3]); |
| } |
| /* |
| * Ignore when input buffer is not on a boundary |
| */ |
| if (ret == -3) |
| ret = 0; |
| return (written); |
| } |
| |
| /** |
| * xmlCharEncOutFunc: |
| * @handler: char enconding transformation data structure |
| * @out: an xmlBuffer for the output. |
| * @in: an xmlBuffer for the input |
| * |
| * Generic front-end for the encoding handler output function |
| * a first call with @in == NULL has to be made firs to initiate the |
| * output in case of non-stateless encoding needing to initiate their |
| * state or the output (like the BOM in UTF16). |
| * In case of UTF8 sequence conversion errors for the given encoder, |
| * the content will be automatically remapped to a CharRef sequence. |
| * |
| * Returns the number of byte written if success, or |
| * -1 general error |
| * -2 if the transcoding fails (for *in is not valid utf8 string or |
| * the result of transformation can't fit into the encoding we want), or |
| */ |
| int |
| xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out, |
| xmlBufferPtr in) { |
| int ret = -2; |
| int written; |
| int writtentot = 0; |
| int toconv; |
| int output = 0; |
| |
| if (handler == NULL) return(-1); |
| if (out == NULL) return(-1); |
| |
| retry: |
| |
| written = out->size - out->use; |
| |
| if (written > 0) |
| written--; /* Gennady: count '/0' */ |
| |
| /* |
| * First specific handling of in = NULL, i.e. the initialization call |
| */ |
| if (in == NULL) { |
| toconv = 0; |
| if (handler->output != NULL) { |
| ret = handler->output(&out->content[out->use], &written, |
| NULL, &toconv); |
| if (ret >= 0) { /* Gennady: check return value */ |
| out->use += written; |
| out->content[out->use] = 0; |
| } |
| } |
| #ifdef LIBXML_ICONV_ENABLED |
| else if (handler->iconv_out != NULL) { |
| ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use], |
| &written, NULL, &toconv); |
| out->use += written; |
| out->content[out->use] = 0; |
| } |
| #endif /* LIBXML_ICONV_ENABLED */ |
| #ifdef DEBUG_ENCODING |
| xmlGenericError(xmlGenericErrorContext, |
| "initialized encoder\n"); |
| #endif |
| return(0); |
| } |
| |
| /* |
| * Conversion itself. |
| */ |
| toconv = in->use; |
| if (toconv == 0) |
| return(0); |
| if (toconv * 2 >= written) { |
| xmlBufferGrow(out, toconv * 2); |
| written = out->size - out->use - 1; |
| } |
| if (handler->output != NULL) { |
| ret = handler->output(&out->content[out->use], &written, |
| in->content, &toconv); |
| xmlBufferShrink(in, toconv); |
| out->use += written; |
| writtentot += written; |
| out->content[out->use] = 0; |
| } |
| #ifdef LIBXML_ICONV_ENABLED |
| else if (handler->iconv_out != NULL) { |
| ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use], |
| &written, in->content, &toconv); |
| xmlBufferShrink(in, toconv); |
| out->use += written; |
| writtentot += written; |
| out->content[out->use] = 0; |
| if (ret == -1) { |
| if (written > 0) { |
| /* |
| * Can be a limitation of iconv |
| */ |
| goto retry; |
| } |
| ret = -3; |
| } |
| } |
| #endif /* LIBXML_ICONV_ENABLED */ |
| else { |
| xmlGenericError(xmlGenericErrorContext, |
| "xmlCharEncOutFunc: no output function !\n"); |
| return(-1); |
| } |
| |
| if (ret >= 0) output += ret; |
| |
| /* |
| * Attempt to handle error cases |
| */ |
| switch (ret) { |
| case 0: |
| #ifdef DEBUG_ENCODING |
| xmlGenericError(xmlGenericErrorContext, |
| "converted %d bytes to %d bytes of output\n", |
| toconv, written); |
| #endif |
| break; |
| case -1: |
| #ifdef DEBUG_ENCODING |
| xmlGenericError(xmlGenericErrorContext, |
| "output conversion failed by lack of space\n"); |
| #endif |
| break; |
| case -3: |
| #ifdef DEBUG_ENCODING |
| xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of output %d left\n", |
| toconv, written, in->use); |
| #endif |
| break; |
| case -2: { |
| int len = in->use; |
| const xmlChar *utf = (const xmlChar *) in->content; |
| int cur; |
| |
| cur = xmlGetUTF8Char(utf, &len); |
| if (cur > 0) { |
| xmlChar charref[20]; |
| |
| #ifdef DEBUG_ENCODING |
| xmlGenericError(xmlGenericErrorContext, |
| "handling output conversion error\n"); |
| xmlGenericError(xmlGenericErrorContext, |
| "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", |
| in->content[0], in->content[1], |
| in->content[2], in->content[3]); |
| #endif |
| /* |
| * Removes the UTF8 sequence, and replace it by a charref |
| * and continue the transcoding phase, hoping the error |
| * did not mangle the encoder state. |
| */ |
| snprintf((char *) charref, sizeof(charref), "&#%d;", cur); |
| xmlBufferShrink(in, len); |
| xmlBufferAddHead(in, charref, -1); |
| |
| goto retry; |
| } else { |
| xmlGenericError(xmlGenericErrorContext, |
| "output conversion failed due to conv error\n"); |
| xmlGenericError(xmlGenericErrorContext, |
| "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", |
| in->content[0], in->content[1], |
| in->content[2], in->content[3]); |
| in->content[0] = ' '; |
| } |
| break; |
| } |
| } |
| return(ret); |
| } |
| |
| /** |
| * xmlCharEncCloseFunc: |
| * @handler: char enconding transformation data structure |
| * |
| * Generic front-end for encoding handler close function |
| * |
| * Returns 0 if success, or -1 in case of error |
| */ |
| int |
| xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) { |
| int ret = 0; |
| if (handler == NULL) return(-1); |
| if (handler->name == NULL) return(-1); |
| #ifdef LIBXML_ICONV_ENABLED |
| /* |
| * Iconv handlers can be used only once, free the whole block. |
| * and the associated icon resources. |
| */ |
| if ((handler->iconv_out != NULL) || (handler->iconv_in != NULL)) { |
| if (handler->name != NULL) |
| xmlFree(handler->name); |
| handler->name = NULL; |
| if (handler->iconv_out != NULL) { |
| if (iconv_close(handler->iconv_out)) |
| ret = -1; |
| handler->iconv_out = NULL; |
| } |
| if (handler->iconv_in != NULL) { |
| if (iconv_close(handler->iconv_in)) |
| ret = -1; |
| handler->iconv_in = NULL; |
| } |
| xmlFree(handler); |
| } |
| #endif /* LIBXML_ICONV_ENABLED */ |
| #ifdef DEBUG_ENCODING |
| if (ret) |
| xmlGenericError(xmlGenericErrorContext, |
| "failed to close the encoding handler\n"); |
| else |
| xmlGenericError(xmlGenericErrorContext, |
| "closed the encoding handler\n"); |
| #endif |
| |
| return(ret); |
| } |
| |
| #ifndef LIBXML_ICONV_ENABLED |
| #ifdef LIBXML_ISO8859X_ENABLED |
| |
/**
 * UTF8ToISO8859x:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 * @xlattable: the 2-level transcoding table
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO 8859-*
 * block of chars out. Every character yields exactly one output byte.
 * Conversion stops early, still reporting success, once @out is full.
 * A call with @in == NULL is the initialization call and does nothing.
 *
 * Layout of @xlattable as read here: entries 0..31 are selected by the low
 * 5 bits of a 2-byte lead, entries 32..47 by the low 4 bits of a 3-byte
 * lead; both hold a block number. 64-byte blocks start at offset 48 and are
 * indexed by the low 6 bits of a trailing byte. A 3-byte sequence goes
 * through two block lookups. A final value of 0 means "not in the set".
 *
 * Returns 0 if success, or -2 if the transcoding fails (malformed or
 *     truncated UTF-8, or a character outside the target set)
 * The value of @inlen after return is the number of octets consumed; on
 *     failure it stops at the first octet of the offending sequence.
 * The value of @outlen after return is the number of octets produced.
 */
static int
UTF8ToISO8859x(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen,
              unsigned char const *xlattable) {
    const unsigned char* outstart = out;
    const unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* inend;
    const unsigned char* seqstart = in;	/* start of the sequence in progress */
    int ret = 0;

    if (in == NULL) {
        /*
	 * initialization nothing to do
	 */
	*outlen = 0;
	*inlen = 0;
	return(0);
    }
    outend = out + (*outlen);
    inend = in + (*inlen);
    /* each iteration emits at most one byte, so one room check is enough */
    while ((in < inend) && (out < outend)) {
	unsigned char d;

	seqstart = in;
	d = *in++;
	if (d < 0x80) {
	    *out++ = d;
	} else if (d < 0xC0) {
	    /* trailing byte in leading position */
	    ret = -2;
	    break;
	} else if (d < 0xE0) {
	    unsigned char c;

	    if (!(in < inend)) {
		/* trailing byte not in input buffer */
		ret = -2;
		break;
	    }
	    c = *in++;
	    /* a trailing byte has the form 10xxxxxx */
	    if ((c & 0xC0) != 0x80) {
		/* not a trailing byte */
		ret = -2;
		break;
	    }
	    c = c & 0x3F;
	    d = d & 0x1F;
	    d = xlattable [48 + c + xlattable [d] * 64];
	    if (d == 0) {
		/* not in character set */
		ret = -2;
		break;
	    }
	    *out++ = d;
	} else if (d < 0xF0) {
	    unsigned char c1;
	    unsigned char c2;

	    if (inend - in < 2) {
		/* trailing bytes not in input buffer */
		ret = -2;
		break;
	    }
	    c1 = *in++;
	    if ((c1 & 0xC0) != 0x80) {
		/* not a trailing byte (c1) */
		ret = -2;
		break;
	    }
	    c2 = *in++;
	    if ((c2 & 0xC0) != 0x80) {
		/* not a trailing byte (c2) */
		ret = -2;
		break;
	    }
	    c1 = c1 & 0x3F;
	    c2 = c2 & 0x3F;
	    d = d & 0x0F;
	    d = xlattable [48 + c2 + xlattable [48 + c1 +
			xlattable [32 + d] * 64] * 64];
	    if (d == 0) {
		/* not in character set */
		ret = -2;
		break;
	    }
	    *out++ = d;
	} else {
	    /* cannot transcode >= U+010000 */
	    ret = -2;
	    break;
	}
	/* the whole sequence was converted */
	seqstart = in;
    }
    *outlen = (int) (out - outstart);
    /*
     * Only count fully converted sequences, so that on failure the caller
     * finds the offending character at the start of what is left.
     */
    *inlen = (int) (seqstart - instart);
    return(ret);
}
| |
/**
 * ISO8859xToUTF8
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ISO 8859-* chars
 * @inlen:  the length of @in
 * @unicodetable: 128 code points, indexed by (byte - 0x80); 0 marks an
 *                undefined byte
 *
 * Take a block of ISO 8859-* chars in and try to convert it to an UTF-8
 * block of chars out. Bytes below 0x80 are copied unchanged, the others
 * are looked up in @unicodetable and emitted as a 2 or 3 byte sequence.
 * Conversion stops early, still reporting success, when the next character
 * does not fit in what is left of @out.
 *
 * Returns 0 if success, or -1 if an undefined code point is met
 * The value of @inlen after return is the number of octets consumed
 * The value of @outlen after return is the number of octets produced.
 */
static int
ISO8859xToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen,
              unsigned short const *unicodetable) {
    unsigned char* outstart = out;
    unsigned char* outend = out + *outlen;
    const unsigned char* instart = in;
    const unsigned char* inend = in + *inlen;
    int ret = 0;

    /*
     * The input byte is only read once it is known to lie inside the
     * buffer, and the room left in @out is checked against the exact size
     * of each sequence before anything is stored.
     */
    while (in < inend) {
        unsigned int c = *in;

        if (c < 0x80) {
            if (out >= outend)
                break;
            *out++ = (unsigned char) c;
        } else {
            /* every table driven character needs at least two bytes */
            if (outend - out < 2)
                break;
            c = unicodetable [c - 0x80];
            if (c == 0) {
                /* undefined code point */
                ret = -1;
                break;
            }
            if (c < 0x800) {
                *out++ = ((c >>  6) & 0x1F) | 0xC0;
                *out++ = (c & 0x3F) | 0x80;
            } else {
                if (outend - out < 3)
                    break;
                *out++ = ((c >>  12) & 0x0F) | 0xE0;
                *out++ = ((c >>  6) & 0x3F) | 0x80;
                *out++ = (c & 0x3F) | 0x80;
            }
        }
        ++in;
    }
    *outlen = (int) (out - outstart);
    *inlen = (int) (in - instart);
    return (ret);
}
| |
| |
| /************************************************************************ |
| * Lookup tables for ISO-8859-2..ISO-8859-16 transcoding * |
| ************************************************************************/ |
| |
| static unsigned short const xmlunicodetable_ISO8859_2 [128] = { /* |
| * 128 entries, 8 per row, indexed by (byte - 0x80): the Unicode code |
| * point stored for each byte 0x80..0xFF. ISO8859xToUTF8 reads a table of |
| * this shape and treats a 0 entry as an undefined code point. |
| * NOTE(review): presumably the ISO-8859-2 table handed to ISO8859xToUTF8; |
| * the call site is outside this chunk -- confirm. |
| */ |
| 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, |
| 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, |
| 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, |
| 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, |
| 0x00a0, 0x0104, 0x02d8, 0x0141, 0x00a4, 0x013d, 0x015a, 0x00a7, |
| 0x00a8, 0x0160, 0x015e, 0x0164, 0x0179, 0x00ad, 0x017d, 0x017b, |
| 0x00b0, 0x0105, 0x02db, 0x0142, 0x00b4, 0x013e, 0x015b, 0x02c7, |
| 0x00b8, 0x0161, 0x015f, 0x0165, 0x017a, 0x02dd, 0x017e, 0x017c, |
| 0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7, |
| 0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e, |
| 0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7, |
| 0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df, |
| 0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7, |
| 0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f, |
| 0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7, |
| 0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9, |
| }; |
| |
| static unsigned char const xmltranscodetable_ISO8859_2 [48 + 6 * 64] = { /* |
| * 48 index bytes followed by 6 blocks of 64 bytes, 16 bytes per string |
| * literal. This is the 2-level layout UTF8ToISO8859x reads: entries 0..31 |
| * (low 5 bits of a 2-byte lead) and 32..47 (low 4 bits of a 3-byte lead) |
| * hold block numbers; blocks start at offset 48 and are indexed by the low |
| * 6 bits of a trailing byte. A final value of 0 means "not in the set". |
| * NOTE(review): presumably the ISO-8859-2 @xlattable of UTF8ToISO8859x; |
| * the call site is outside this chunk -- confirm. |
| */ |
| "\x00\x00\x01\x05\x02\x04\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f" |
| "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f" |
| "\xa0\x00\x00\x00\xa4\x00\x00\xa7\xa8\x00\x00\x00\x00\xad\x00\x00" |
| "\xb0\x00\x00\x00\xb4\x00\x00\x00\xb8\x00\x00\x00\x00\x00\x00\x00" |
| "\x00\x00\xc3\xe3\xa1\xb1\xc6\xe6\x00\x00\x00\x00\xc8\xe8\xcf\xef" |
| "\xd0\xf0\x00\x00\x00\x00\x00\x00\xca\xea\xcc\xec\x00\x00\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\xc5\xe5\x00\x00\xa5\xb5\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\xb7\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\xa2\xff\x00\xb2\x00\xbd\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x00\xa3\xb3\xd1\xf1\x00\x00\xd2\xf2\x00\x00\x00\x00\x00\x00\x00" |
| "\xd5\xf5\x00\x00\xc0\xe0\x00\x00\xd8\xf8\xa6\xb6\x00\x00\xaa\xba" |
| "\xa9\xb9\xde\xfe\xab\xbb\x00\x00\x00\x00\x00\x00\x00\x00\xd9\xf9" |
| "\xdb\xfb\x00\x00\x00\x00\x00\x00\x00\xac\xbc\xaf\xbf\xae\xbe\x00" |
| "\x00\xc1\xc2\x00\xc4\x00\x00\xc7\x00\xc9\x00\xcb\x00\xcd\xce\x00" |
| "\x00\x00\x00\xd3\xd4\x00\xd6\xd7\x00\x00\xda\x00\xdc\xdd\x00\xdf" |
| "\x00\xe1\xe2\x00\xe4\x00\x00\xe7\x00\xe9\x00\xeb\x00\xed\xee\x00" |
| "\x00\x00\x00\xf3\xf4\x00\xf6\xf7\x00\x00\xfa\x00\xfc\xfd\x00\x00" |
| }; |
| |
| static unsigned short const xmlunicodetable_ISO8859_3 [128] = { /* |
| * 128 entries, 8 per row, indexed by (byte - 0x80): the Unicode code |
| * point stored for each byte 0x80..0xFF. ISO8859xToUTF8 reads a table of |
| * this shape and treats a 0 entry as an undefined code point; this table |
| * contains several 0x0000 entries. |
| * NOTE(review): presumably the ISO-8859-3 table handed to ISO8859xToUTF8; |
| * the call site is outside this chunk -- confirm. |
| */ |
| 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, |
| 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, |
| 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, |
| 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, |
| 0x00a0, 0x0126, 0x02d8, 0x00a3, 0x00a4, 0x0000, 0x0124, 0x00a7, |
| 0x00a8, 0x0130, 0x015e, 0x011e, 0x0134, 0x00ad, 0x0000, 0x017b, |
| 0x00b0, 0x0127, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x0125, 0x00b7, |
| 0x00b8, 0x0131, 0x015f, 0x011f, 0x0135, 0x00bd, 0x0000, 0x017c, |
| 0x00c0, 0x00c1, 0x00c2, 0x0000, 0x00c4, 0x010a, 0x0108, 0x00c7, |
| 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, |
| 0x0000, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x0120, 0x00d6, 0x00d7, |
| 0x011c, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x016c, 0x015c, 0x00df, |
| 0x00e0, 0x00e1, 0x00e2, 0x0000, 0x00e4, 0x010b, 0x0109, 0x00e7, |
| 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, |
| 0x0000, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x0121, 0x00f6, 0x00f7, |
| 0x011d, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x016d, 0x015d, 0x02d9, |
| }; |
| |
| static unsigned char const xmltranscodetable_ISO8859_3 [48 + 7 * 64] = { /* |
| * 48 index bytes followed by 7 blocks of 64 bytes, 16 bytes per string |
| * literal. This is the 2-level layout UTF8ToISO8859x reads: entries 0..31 |
| * (low 5 bits of a 2-byte lead) and 32..47 (low 4 bits of a 3-byte lead) |
| * hold block numbers; blocks start at offset 48 and are indexed by the low |
| * 6 bits of a trailing byte. A final value of 0 means "not in the set". |
| * NOTE(review): presumably the ISO-8859-3 @xlattable of UTF8ToISO8859x; |
| * the call site is outside this chunk -- confirm. |
| */ |
| "\x04\x00\x01\x06\x02\x05\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f" |
| "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f" |
| "\xa0\x00\x00\xa3\xa4\x00\x00\xa7\xa8\x00\x00\x00\x00\xad\x00\x00" |
| "\xb0\x00\xb2\xb3\xb4\xb5\x00\xb7\xb8\x00\x00\x00\x00\xbd\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\xc6\xe6\xc5\xe5\x00\x00\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xd8\xf8\xab\xbb" |
| "\xd5\xf5\x00\x00\xa6\xb6\xa1\xb1\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\xa9\xb9\x00\x00\xac\xbc\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\xa2\xff\x00\x00\x00\x00\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\xf0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xde\xfe\xaa\xba" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xdd\xfd\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xaf\xbf\x00\x00\x00" |
| "\xc0\xc1\xc2\x00\xc4\x00\x00\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf" |
| "\x00\xd1\xd2\xd3\xd4\x00\xd6\xd7\x00\xd9\xda\xdb\xdc\x00\x00\xdf" |
| "\xe0\xe1\xe2\x00\xe4\x00\x00\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef" |
| "\x00\xf1\xf2\xf3\xf4\x00\xf6\xf7\x00\xf9\xfa\xfb\xfc\x00\x00\x00" |
| }; |
| |
| static unsigned short const xmlunicodetable_ISO8859_4 [128] = { /* |
| * 128 entries, 8 per row, indexed by (byte - 0x80): the Unicode code |
| * point stored for each byte 0x80..0xFF. ISO8859xToUTF8 reads a table of |
| * this shape and treats a 0 entry as an undefined code point. |
| * NOTE(review): presumably the ISO-8859-4 table handed to ISO8859xToUTF8; |
| * the call site is outside this chunk -- confirm. |
| */ |
| 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, |
| 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, |
| 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, |
| 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, |
| 0x00a0, 0x0104, 0x0138, 0x0156, 0x00a4, 0x0128, 0x013b, 0x00a7, |
| 0x00a8, 0x0160, 0x0112, 0x0122, 0x0166, 0x00ad, 0x017d, 0x00af, |
| 0x00b0, 0x0105, 0x02db, 0x0157, 0x00b4, 0x0129, 0x013c, 0x02c7, |
| 0x00b8, 0x0161, 0x0113, 0x0123, 0x0167, 0x014a, 0x017e, 0x014b, |
| 0x0100, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x012e, |
| 0x010c, 0x00c9, 0x0118, 0x00cb, 0x0116, 0x00cd, 0x00ce, 0x012a, |
| 0x0110, 0x0145, 0x014c, 0x0136, 0x00d4, 0x00d5, 0x00d6, 0x00d7, |
| 0x00d8, 0x0172, 0x00da, 0x00db, 0x00dc, 0x0168, 0x016a, 0x00df, |
| 0x0101, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x012f, |
| 0x010d, 0x00e9, 0x0119, 0x00eb, 0x0117, 0x00ed, 0x00ee, 0x012b, |
| 0x0111, 0x0146, 0x014d, 0x0137, 0x00f4, 0x00f5, 0x00f6, 0x00f7, |
| 0x00f8, 0x0173, 0x00fa, 0x00fb, 0x00fc, 0x0169, 0x016b, 0x02d9, |
| }; |
| |
| static unsigned char const xmltranscodetable_ISO8859_4 [48 + 6 * 64] = { /* |
| * 48 index bytes followed by 6 blocks of 64 bytes, 16 bytes per string |
| * literal. This is the 2-level layout UTF8ToISO8859x reads: entries 0..31 |
| * (low 5 bits of a 2-byte lead) and 32..47 (low 4 bits of a 3-byte lead) |
| * hold block numbers; blocks start at offset 48 and are indexed by the low |
| * 6 bits of a trailing byte. A final value of 0 means "not in the set". |
| * NOTE(review): presumably the ISO-8859-4 @xlattable of UTF8ToISO8859x; |
| * the call site is outside this chunk -- confirm. |
| */ |
| "\x00\x00\x01\x05\x02\x03\x00\x00\x00\x00\x00\x04\x00\x00\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f" |
| "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f" |
| "\xa0\x00\x00\x00\xa4\x00\x00\xa7\xa8\x00\x00\x00\x00\xad\x00\xaf" |
| "\xb0\x00\x00\x00\xb4\x00\x00\x00\xb8\x00\x00\x00\x00\x00\x00\x00" |
| "\xc0\xe0\x00\x00\xa1\xb1\x00\x00\x00\x00\x00\x00\xc8\xe8\x00\x00" |
| "\xd0\xf0\xaa\xba\x00\x00\xcc\xec\xca\xea\x00\x00\x00\x00\x00\x00" |
| "\x00\x00\xab\xbb\x00\x00\x00\x00\xa5\xb5\xcf\xef\x00\x00\xc7\xe7" |
| "\x00\x00\x00\x00\x00\x00\xd3\xf3\xa2\x00\x00\xa6\xb6\x00\x00\x00" |
| "\x00\x00\x00\x00\x00\xd1\xf1\x00\x00\x00\xbd\xbf\xd2\xf2\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\xa3\xb3\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\xa9\xb9\x00\x00\x00\x00\xac\xbc\xdd\xfd\xde\xfe\x00\x00\x00\x00" |
| "\x00\x00\xd9\xf9\x00\x00\x00\x00\x00\x00\x00\x00\x00\xae\xbe\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\xb7\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\x00\xb2\x00\x00\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x00\xc1\xc2\xc3\xc4\xc5\xc6\x00\x00\xc9\x00\xcb\x00\xcd\xce\x00" |
| "\x00\x00\x00\x00\xd4\xd5\xd6\xd7\xd8\x00\xda\xdb\xdc\x00\x00\xdf" |
| "\x00\xe1\xe2\xe3\xe4\xe5\xe6\x00\x00\xe9\x00\xeb\x00\xed\xee\x00" |
| "\x00\x00\x00\x00\xf4\xf5\xf6\xf7\xf8\x00\xfa\xfb\xfc\x00\x00\x00" |
| }; |
| |
| static unsigned short const xmlunicodetable_ISO8859_5 [128] = { /* |
| * 128 entries, 8 per row, indexed by (byte - 0x80): the Unicode code |
| * point stored for each byte 0x80..0xFF. ISO8859xToUTF8 reads a table of |
| * this shape and treats a 0 entry as an undefined code point. The 0x2116 |
| * entry is the only one here that needs a 3-byte UTF-8 sequence. |
| * NOTE(review): presumably the ISO-8859-5 table handed to ISO8859xToUTF8; |
| * the call site is outside this chunk -- confirm. |
| */ |
| 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, |
| 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, |
| 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, |
| 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, |
| 0x00a0, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407, |
| 0x0408, 0x0409, 0x040a, 0x040b, 0x040c, 0x00ad, 0x040e, 0x040f, |
| 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, |
| 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f, |
| 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, |
| 0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f, |
| 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, |
| 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f, |
| 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, |
| 0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f, |
| 0x2116, 0x0451, 0x0452, 0x0453, 0x0454, 0x0455, 0x0456, 0x0457, |
| 0x0458, 0x0459, 0x045a, 0x045b, 0x045c, 0x00a7, 0x045e, 0x045f, |
| }; |
| |
| static unsigned char const xmltranscodetable_ISO8859_5 [48 + 6 * 64] = { |
| "\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x02\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f" |
| "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f" |
| "\xa0\x00\x00\x00\x00\x00\x00\xfd\x00\x00\x00\x00\x00\xad\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x00\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\x00\xae\xaf" |
| "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf" |
| "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf" |
| "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf" |
| "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef" |
| "\x00\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\x00\xfe\xff" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" |
| |