277 lines
5.9 KiB
C
277 lines
5.9 KiB
C
#ifndef lint
|
|
static char *rcsid = "$Id: utf8.c,v 1.1 2003/06/04 00:26:44 marka Exp $";
|
|
#endif
|
|
|
|
/*
|
|
* Copyright (c) 2000 Japan Network Information Center. All rights reserved.
|
|
*
|
|
* By using this file, you agree to the terms and conditions set forth bellow.
|
|
*
|
|
* LICENSE TERMS AND CONDITIONS
|
|
*
|
|
* The following License Terms and Conditions apply, unless a different
|
|
* license is obtained from Japan Network Information Center ("JPNIC"),
|
|
* a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
|
|
* Chiyoda-ku, Tokyo 101-0047, Japan.
|
|
*
|
|
* 1. Use, Modification and Redistribution (including distribution of any
|
|
* modified or derived work) in source and/or binary forms is permitted
|
|
* under this License Terms and Conditions.
|
|
*
|
|
* 2. Redistribution of source code must retain the copyright notices as they
|
|
* appear in each source code file, this License Terms and Conditions.
|
|
*
|
|
* 3. Redistribution in binary form must reproduce the Copyright Notice,
|
|
* this License Terms and Conditions, in the documentation and/or other
|
|
* materials provided with the distribution. For the purposes of binary
|
|
* distribution the "Copyright Notice" refers to the following language:
|
|
* "Copyright (c) 2000-2002 Japan Network Information Center. All rights reserved."
|
|
*
|
|
* 4. The name of JPNIC may not be used to endorse or promote products
|
|
* derived from this Software without specific prior written approval of
|
|
* JPNIC.
|
|
*
|
|
* 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
|
|
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
|
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JPNIC BE LIABLE
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
|
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
|
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
|
|
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
|
|
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
|
|
*/
|
|
|
|
#include <config.h>
|
|
|
|
#include <stddef.h>
|
|
|
|
#include <idn/assert.h>
|
|
#include <idn/logmacro.h>
|
|
#include <idn/utf8.h>
|
|
#include <idn/debug.h>
|
|
|
|
#define UTF8_WIDTH(c) \
|
|
(((c) < 0x80) ? 1 : \
|
|
((c) < 0xc0) ? 0 : \
|
|
((c) < 0xe0) ? 2 : \
|
|
((c) < 0xf0) ? 3 : \
|
|
((c) < 0xf8) ? 4 : \
|
|
((c) < 0xfc) ? 5 : \
|
|
((c) < 0xfe) ? 6 : 0)
|
|
|
|
#define VALID_CONT_BYTE(c) (0x80 <= (c) && (c) < 0xc0)
|
|
|
|
int
|
|
idn_utf8_mblen(const char *s) {
|
|
int c = *(unsigned char *)s;
|
|
|
|
assert(s != NULL);
|
|
|
|
#if 0
|
|
TRACE(("idn_utf8_mblen(s=<%s>)\n", idn__debug_hexstring(s, 6)));
|
|
#endif
|
|
|
|
return UTF8_WIDTH(c);
|
|
}
|
|
|
|
int
|
|
idn_utf8_getmb(const char *s, size_t len, char *buf) {
|
|
/* buf must be at least 7-bytes long */
|
|
const unsigned char *p = (const unsigned char *)s;
|
|
unsigned char *q = (unsigned char *)buf;
|
|
int width = UTF8_WIDTH(*p);
|
|
int w;
|
|
|
|
assert(s != NULL);
|
|
|
|
#if 0
|
|
TRACE(("idn_utf8_getmb(s=<%s>,len=%d)\n",
|
|
idn__debug_hexstring(s, 6), len));
|
|
#endif
|
|
|
|
if (width == 0 || len < width)
|
|
return (0);
|
|
|
|
/* Copy the first byte. */
|
|
*q++ = *p++;
|
|
|
|
/* .. and the rest. */
|
|
w = width;
|
|
while (--w > 0) {
|
|
if (!VALID_CONT_BYTE(*p))
|
|
return (0);
|
|
*q++ = *p++;
|
|
}
|
|
return (width);
|
|
}
|
|
|
|
int
|
|
idn_utf8_getwc(const char *s, size_t len, unsigned long *vp) {
|
|
unsigned long v;
|
|
unsigned long min;
|
|
const unsigned char *p = (const unsigned char *)s;
|
|
int c;
|
|
int width;
|
|
int rest;
|
|
|
|
assert(s != NULL);
|
|
|
|
#if 0
|
|
TRACE(("idn_utf8_getwc(s=<%s>,len=%d)\n",
|
|
idn__debug_hexstring(s, 10), len));
|
|
#endif
|
|
|
|
c = *p++;
|
|
width = UTF8_WIDTH(c);
|
|
|
|
switch (width) {
|
|
case 0:
|
|
return (0);
|
|
case 1:
|
|
v = c;
|
|
min = 0;
|
|
break;
|
|
case 2:
|
|
v = c & 0x1f;
|
|
min = 0x80;
|
|
break;
|
|
case 3:
|
|
v = c & 0xf;
|
|
min = 0x800;
|
|
break;
|
|
case 4:
|
|
v = c & 0x7;
|
|
min = 0x10000;
|
|
break;
|
|
case 5:
|
|
v = c & 3;
|
|
min = 0x200000;
|
|
break;
|
|
case 6:
|
|
v = c & 1;
|
|
min = 0x4000000;
|
|
break;
|
|
default:
|
|
FATAL(("idn_utf8_getint: internal error\n"));
|
|
return (0);
|
|
}
|
|
|
|
if (len < width)
|
|
return (0);
|
|
|
|
rest = width - 1;
|
|
while (rest-- > 0) {
|
|
if (!VALID_CONT_BYTE(*p))
|
|
return (0);
|
|
v = (v << 6) | (*p & 0x3f);
|
|
p++;
|
|
}
|
|
|
|
if (v < min)
|
|
return (0);
|
|
|
|
*vp = v;
|
|
return (width);
|
|
}
|
|
|
|
int
|
|
idn_utf8_putwc(char *s, size_t len, unsigned long v) {
|
|
unsigned char *p = (unsigned char *)s;
|
|
int mask;
|
|
int off;
|
|
int l;
|
|
|
|
assert(s != NULL);
|
|
|
|
#if 0
|
|
TRACE(("idn_utf8_putwc(v=%lx)\n", v));
|
|
#endif
|
|
|
|
if (v < 0x80) {
|
|
mask = 0;
|
|
l = 1;
|
|
} else if (v < 0x800) {
|
|
mask = 0xc0;
|
|
l = 2;
|
|
} else if (v < 0x10000) {
|
|
mask = 0xe0;
|
|
l = 3;
|
|
} else if (v < 0x200000) {
|
|
mask = 0xf0;
|
|
l = 4;
|
|
} else if (v < 0x4000000) {
|
|
mask = 0xf8;
|
|
l = 5;
|
|
} else if (v < 0x80000000) {
|
|
mask = 0xfc;
|
|
l = 6;
|
|
} else {
|
|
return (0);
|
|
}
|
|
|
|
if (len < l)
|
|
return (0);
|
|
|
|
off = 6 * (l - 1);
|
|
*p++ = (v >> off) | mask;
|
|
mask = 0x80;
|
|
while (off > 0) {
|
|
off -= 6;
|
|
*p++ = ((v >> off) & 0x3f) | mask;
|
|
}
|
|
return l;
|
|
}
|
|
|
|
int
|
|
idn_utf8_isvalidchar(const char *s) {
|
|
unsigned long dummy;
|
|
|
|
TRACE(("idn_utf8_isvalidchar(s=<%s>)\n",
|
|
idn__debug_hexstring(s, 6)));
|
|
|
|
return (idn_utf8_getwc(s, 6, &dummy) > 0);
|
|
}
|
|
|
|
int
|
|
idn_utf8_isvalidstring(const char *s) {
|
|
unsigned long dummy;
|
|
int width;
|
|
|
|
assert(s != NULL);
|
|
|
|
TRACE(("idn_utf8_isvalidstring(s=<%s>)\n",
|
|
idn__debug_hexstring(s, 20)));
|
|
|
|
while (*s != '\0') {
|
|
width = idn_utf8_getwc(s, 6, &dummy);
|
|
if (width == 0)
|
|
return (0);
|
|
s += width;
|
|
}
|
|
return (1);
|
|
}
|
|
|
|
char *
|
|
idn_utf8_findfirstbyte(const char *s, const char *known_top) {
|
|
const unsigned char *p = (const unsigned char *)s;
|
|
const unsigned char *t = (const unsigned char *)known_top;
|
|
|
|
assert(s != NULL && known_top != NULL && known_top <= s);
|
|
|
|
TRACE(("idn_utf8_findfirstbyte(s=<%s>)\n",
|
|
idn__debug_hexstring(s, 8)));
|
|
|
|
while (p >= t) {
|
|
if (!VALID_CONT_BYTE(*p))
|
|
break;
|
|
p--;
|
|
}
|
|
if (p < t || UTF8_WIDTH(*p) == 0)
|
|
return (NULL);
|
|
|
|
return ((char *)p);
|
|
}
|