bind9/contrib/idn/mdnkit/lib/utf6.c

#ifndef lint
static char *rcsid = "$Id: utf6.c,v 1.1 2001/06/09 00:30:30 tale Exp $";
#endif

/*
 * Copyright (c) 2000 Japan Network Information Center.  All rights reserved.
 *
 * By using this file, you agree to the terms and conditions set forth bellow.
 *
 * 			LICENSE TERMS AND CONDITIONS
 *
 * The following License Terms and Conditions apply, unless a different
 * license is obtained from Japan Network Information Center ("JPNIC"),
 * a Japanese association, Fuundo Bldg., 1-2 Kanda Ogawamachi, Chiyoda-ku,
 * Tokyo, Japan.
 *
 * 1. Use, Modification and Redistribution (including distribution of any
 *    modified or derived work) in source and/or binary forms is permitted
 *    under this License Terms and Conditions.
 *
 * 2. Redistribution of source code must retain the copyright notices as they
 *    appear in each source code file, this License Terms and Conditions.
 *
 * 3. Redistribution in binary form must reproduce the Copyright Notice,
 *    this License Terms and Conditions, in the documentation and/or other
 *    materials provided with the distribution.  For the purposes of binary
 *    distribution the "Copyright Notice" refers to the following language:
 *    "Copyright (c) Japan Network Information Center.  All rights reserved."
 *
 * 4. Neither the name of JPNIC may be used to endorse or promote products
 *    derived from this Software without specific prior written approval of
 *    JPNIC.
 *
 * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
 *    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 *    PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL JPNIC BE LIABLE
 *    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 *    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 *    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 *    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 *    WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 *    OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 *    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
 *
 * 6. Indemnification by Licensee
 *    Any person or entities using and/or redistributing this Software under
 *    this License Terms and Conditions shall defend indemnify and hold
 *    harmless JPNIC from and against any and all judgements damages,
 *    expenses, settlement liabilities, cost and other liabilities of any
 *    kind as a result of use and redistribution of this Software or any
 *    claim, suite, action, litigation or proceeding by any third party
 *    arising out of or relates to this License Terms and Conditions.
 *
 * 7. Governing Law, Jurisdiction and Venue
 *    This License Terms and Conditions shall be governed by and and
 *    construed in accordance with the law of Japan. Any person or entities
 *    using and/or redistributing this Software under this License Terms and
 *    Conditions hereby agrees and consent to the personal and exclusive
 *    jurisdiction and venue of Tokyo District Court of Japan.
 */

#include <config.h>

#include <stddef.h>
#include <stdlib.h>
#include <string.h>

#include <mdn/result.h>
#include <mdn/assert.h>
#include <mdn/logmacro.h>
#include <mdn/converter.h>
#include <mdn/utf8.h>
#include <mdn/debug.h>
#include <mdn/utf6.h>
#include <mdn/ace.h>
#include <mdn/util.h>

#ifndef MDN_UTF6_PREFIX
#define MDN_UTF6_PREFIX		"wq--"
#endif

#define UTF6_SAME_BYTE_MASK	0x00ff
#define UTF6_SAME_NIBBLE_MASK	0x0fff
#define UTF6_PLAIN_MASK		0xffff

#define UTF6_BUF_SIZE		128		/* more than enough */

/*
 * Compression type.
 */
enum {
	same_byte_mode,		/* the most significant byte of all non
				   '-' characters is the same value */
	same_nibble_mode,	/* the most significant nibble of all non
				   '-' characters is the same value */
	plain_mode		/* not compressed */
};

static mdn_result_t	utf6_decode(const char *from, size_t fromlen,
				    char *to, size_t tolen);
static mdn_result_t	utf6_decode_utf16(const char *from, size_t fromlen,
					  unsigned short *buf, size_t *lenp);
static mdn_result_t	utf6_decode_vlhex(const char *from, size_t len,
					  size_t *reslen,
					  unsigned short *value);
static mdn_result_t	utf6_encode(const char *from, size_t fromlen,
				    char *to, size_t tolen);
static mdn_result_t	utf6_encode_utf16(const unsigned short *p,
					  size_t len, char *to, size_t tolen,
					  int compress);
static mdn_result_t	utf6_encode_vlhex(unsigned short value, char *to,
					  size_t tolen, size_t *reslen);
static int		get_compress_mode(const unsigned short *p, size_t len);

static mdn__ace_t utf6_ctx = {
	mdn__ace_prefix,
	MDN_UTF6_PREFIX,
	utf6_encode,
	utf6_decode,
};

/* ARGSUSED */
mdn_result_t
mdn__utf6_open(mdn_converter_t ctx, mdn_converter_dir_t dir, void **privdata) {
	return (mdn_success);
}

/* ARGSUSED */
mdn_result_t
mdn__utf6_close(mdn_converter_t ctx, void *privdata, mdn_converter_dir_t dir) {
	return (mdn_success);
}

mdn_result_t
mdn__utf6_convert(mdn_converter_t ctx, void *privdata, mdn_converter_dir_t dir,
		    const char *from, char *to, size_t tolen)
{
	mdn_result_t r;

	assert(ctx != NULL &&
	       (dir == mdn_converter_l2u || dir == mdn_converter_u2l));

	TRACE(("mdn__utf6_convert(dir=%s,from=\"%s\")\n",
	       dir == mdn_converter_l2u ? "l2u" : "u2l",
	       mdn_debug_xstring(from, 20)));

	r = mdn__ace_convert(&utf6_ctx, dir, from, to, tolen);
	if (r != mdn_success)
		return (r);

	DUMP(("mdn__utf6_convert: \"%s\"\n", mdn_debug_xstring(to, 70)));

	return (r);
}

static mdn_result_t
utf6_decode(const char *from, size_t fromlen, char *to, size_t tolen) {
	unsigned short *buf;
	unsigned short local_buf[UTF6_BUF_SIZE];
	size_t len, reslen;
	mdn_result_t r;

	/*
	 * Allocate sufficient buffer.
	 */
	if (fromlen > UTF6_BUF_SIZE) {
		if ((buf = malloc(sizeof(*buf) * fromlen)) == NULL)
			return (mdn_nomemory);
	} else {
		/* Use local buffer. */
		buf = local_buf;
	}

	/*
	 * Decode base32 and decompress.
	 */
	r = utf6_decode_utf16(from, fromlen, buf, &len);
	if (r != mdn_success)
		goto ret;

	/*
	 * Now 'buf' holds the decompressed string, which must contain
	 * UTF-16 characters.  Convert them into UTF-8.
	 */
	r = mdn_util_utf16toutf8(buf, len, to, tolen, &reslen);
	if (r != mdn_success)
		goto ret;

	/*
	 * Terminate with NUL.
	 */
	if (tolen <= reslen) {
		r = mdn_buffer_overflow;
		goto ret;
	}

	to += reslen;
	*to = '\0';
	tolen -= reslen;

	r = mdn_success;

ret:
	if (buf != local_buf)
		free(buf);
	return (r);
}

static mdn_result_t
utf6_decode_utf16(const char *from, size_t fromlen,
		  unsigned short *buf, size_t *lenp)
{
	mdn_result_t r;
	unsigned short value;
	unsigned short cpart;
	unsigned short vmax;
	size_t len;
	size_t reslen;

	/*
	 * Decode Base32 and put the result bytes to 'buf'.
	 * Since decoded string will be shorter in length, and
	 * the caller allocated 'buf' so that its length is not
	 * less than 'fromlen', we don't have to worry about overflow.
	 */

	if (fromlen <= 0)
	    return mdn_success;

	switch (*from) {
	case 'y':
	case 'Y':
		/*
		 * same_byte_mode.
		 */
		fromlen--;
		from++;
		r = utf6_decode_vlhex(from, fromlen, &reslen, &value);
		if (r != mdn_success)
			return (mdn_invalid_encoding);
		from += reslen;
		fromlen -= reslen;

		cpart = value * 0x0100;
		vmax = 0x00ff;
		break;

	case 'z':
	case 'Z':
		/*
		 * same_nibble_mode.
		 */
		fromlen--;
		from++;
		r = utf6_decode_vlhex(from, fromlen, &reslen, &value);
		if (r != mdn_success)
			return (mdn_invalid_encoding);
		from += reslen;
		fromlen -= reslen;

		cpart = value * 0x1000;
		vmax = 0x0fff;
		break;

	default:
		/*
		 * plain_mode.
		 */
		cpart = 0x0000;
		vmax = 0xffff;
		break;
	}


	for (len = 0; fromlen > 0; len++) {
		if (*from == '-') {
			*buf++ = '-';
			from++;
			fromlen--;
		} else {
			r = utf6_decode_vlhex(from, fromlen, &reslen, &value);
			if (r != mdn_success)
				return (mdn_invalid_encoding);
			if (value > vmax)
				return (mdn_invalid_encoding);
			*buf++ = cpart + value;
			from += reslen;
			fromlen -= reslen;
		}
	}

	*buf = '\0';
	*lenp = len;
	return (mdn_success);
}

static mdn_result_t
utf6_decode_vlhex(const char *from, size_t len, size_t *reslen,
		  unsigned short *value) {
	unsigned short v;
	int i;

	/*
	 * Decode the first character of a variable length HEX string.
	 * The character must be in set of [ghijklmnopqrstuv].
	 */
	if (len <= 0)
		return (mdn_invalid_encoding);

	if ('G' <= *from && *from <= 'V')
		v = *from - 'G';
	else if ('g' <= *from && *from <= 'v')
		v = *from - 'g';
	else
		return (mdn_invalid_encoding);
	from++;
	len--;
	i = 1;

	/*
	 * Decode the rest characters of a variable length HEX string.
	 * The every character must be in set of [0123456789abcdef].
	 */
	for (;;) {
		if (len <= 0)
			break;
		if ('0' <= *from && *from <= '9')
			v = (v << 4) + (*from - '0');
		else if ('A' <= *from && *from <= 'F')
			v = (v << 4) + 0x0a + (*from - 'A');
		else if ('a' <= *from && *from <= 'f')
			v = (v << 4) + 0x0a + (*from - 'a');
		else
			break;
		from++;
		len--;
		i++;
	}

	*value = v;
	*reslen = i;
	return (mdn_success);
}

static mdn_result_t
utf6_encode(const char *from, size_t fromlen, char *to, size_t tolen) {
	unsigned short *buf;
	unsigned short local_buf[UTF6_BUF_SIZE];	/* UTF-16 */
	mdn_result_t r;
	size_t buflen, len;

	/*
	 * Convert to UTF-16.
	 */
	buf = local_buf;
	buflen = UTF6_BUF_SIZE;
	for (;;) {
		r = mdn_util_utf8toutf16(from, fromlen,
					 buf, buflen, &len);
		if (r == mdn_buffer_overflow) {
			buflen *= 2;
			if (buf == local_buf)
				buf = malloc(sizeof(*buf) * buflen);
			else
				buf = realloc(buf, sizeof(*buf) * buflen);
			if (buf == NULL)
				return (mdn_nomemory);
		} else if (r == mdn_success) {
			break;
		} else {
			goto ret;
		}
	}

	/*
	 * Compress, encode in base-32 and output.
	 */
	r = utf6_encode_utf16(buf, len, to, tolen,
			      get_compress_mode(buf, len));

ret:
	if (buf != local_buf)
		free(buf);
	return (r);
}

static mdn_result_t
utf6_encode_utf16(const unsigned short *p, size_t len,
		  char *to, size_t tolen, int compress_mode)
{
	mdn_result_t r;
	unsigned short mask;
	size_t reslen;
	int i;

	if (len <= 0)
	    return mdn_success;

	switch (compress_mode) {
	case same_byte_mode:
		mask = UTF6_SAME_BYTE_MASK;

		if (tolen < 1)
			return (mdn_buffer_overflow);
		*to++ = 'y';
		tolen--;
		r = utf6_encode_vlhex((p[0] >> 8) & 0x00ff, to, tolen,
				      &reslen);
		if (r != mdn_success)
			return (r);
		to += reslen;
		tolen -= reslen;

		break;

	case same_nibble_mode:
		mask = UTF6_SAME_NIBBLE_MASK;

		if (tolen < 1)
			return (mdn_buffer_overflow);
		*to++ = 'z';
		tolen--;
		r = utf6_encode_vlhex((p[0] >> 4) & 0x0fff, to, tolen,
				      &reslen);
		if (r != mdn_success)
			return (r);
		to += reslen;
		tolen -= reslen;

		break;

	default:
		mask = UTF6_PLAIN_MASK;
		break;
	}

	for (i = 0; i < len; i++) {
		if (p[i] == '-') {
			if (tolen < 1)
				return (mdn_buffer_overflow);
			*to++ = '-';
			tolen--;

		} else {
			r = utf6_encode_vlhex(p[i] & mask, to, tolen, &reslen);
			if (r != mdn_success)
				return (r);
			to += reslen;
			tolen -= reslen;
		}
	}

	if (tolen <= 0)
		return (mdn_buffer_overflow);

	*to = '\0';
	return (mdn_success);
}

static mdn_result_t
utf6_encode_vlhex(unsigned short value, char *to, size_t tolen,
		  size_t *reslen) {

	static const char *vlhex_string1 = "ghijklmnopqrstuv";
	static const char *vlhex_string2 = "0123456789abcdef";

	/*
	 * Encode an integer in the range of 0x000 - 0xffff as variable
	 * length HEX string.
	 */
	if (value <= 0x000f) {
		if (tolen < 1)
			return (mdn_buffer_overflow);
		*to++ = vlhex_string1[ value        & 0x0f];
		*reslen = 1;

	} else if (value <= 0x00ff) {
		if (tolen < 2)
			return (mdn_buffer_overflow);
		*to++ = vlhex_string1[(value >> 4)  & 0x0f];
		*to++ = vlhex_string2[ value        & 0x0f];
		*reslen = 2;

	} else if (value <= 0x0fff) {
		if (tolen < 3)
			return (mdn_buffer_overflow);
		*to++ = vlhex_string1[(value >> 8)  & 0x0f];
		*to++ = vlhex_string2[(value >> 4)  & 0x0f];
		*to++ = vlhex_string2[ value        & 0x0f];
		*reslen = 3;

	} else {
		if (tolen < 4)
			return (mdn_buffer_overflow);
		*to++ = vlhex_string1[(value >> 12) & 0x0f];
		*to++ = vlhex_string2[(value >> 8)  & 0x0f];
		*to++ = vlhex_string2[(value >> 4)  & 0x0f];
		*to++ = vlhex_string2[ value        & 0x0f];
		*reslen = 4;
	}

	return (mdn_success);
}

static int
get_compress_mode(const unsigned short *p, size_t len) {
	int non_hyphens = 0;
	unsigned short same_bytes = 0;
	unsigned short same_nibbles = 0;
	int i;

	if (len <= 0)
		return plain_mode;

	for (i = 0; i < len; i++) {
		if (p[i] != '-') {
			non_hyphens++;
			if ((p[0] & 0xff00) == (p[i] & 0xff00))
				same_bytes++;
			else if ((p[0] & 0xf000) == (p[i] & 0xf000))
				same_nibbles++;
		}
	}

	if (non_hyphens < 2) {
		/*
		 * The number of non '-' characters is less than 2.
		 */
		return plain_mode;
	} else if (same_bytes == non_hyphens) {
		/*
		 * The same most significant byte of the every non '-'
		 * character is the same value.
		 */
		return same_byte_mode;
	} else if (same_nibbles == non_hyphens) {
		/*
		 * The same most significant nibble of the every non '-'
		 * character is the same value.
		 */
		return same_nibble_mode;
	} else {
		/*
		 * Not matched above.
		 */
		return plain_mode;
	}

	/* Not reached */
}