@i ../copyright.w

@* uni\_uc\_to\_utf8.

Converts a UTF-16 string to UTF-8 format.

@c

#include <unicode.h>

/* Convert the zero terminated UTF-16 string |src| into a freshly
   allocated, zero terminated UTF-8 string.  All memory comes from
   |ufunct->realloc|; presumably the caller releases the result, but
   that contract is not visible here.  |ufunct->error| is reset to
   |U_ERR_NONE| on entry and set to an error code when the conversion
   stops early, in which case the part converted so far is returned.
   The result is |NULL| when |src| is |NULL|, or when an allocation
   fails before anything has been converted. */
char*
uni_uc_to_utf8(Ufunctions *ufunct, Uchar* src){
	char *ret = NULL;
	char *old_ret = NULL;
	int ret_len = 0;

	ufunct->error = U_ERR_NONE;

	if(!src)
		return NULL;

	/* Ignore a leading BOM, in either byte order.  Testing for it keeps
	   us from stepping over the terminator of an empty string and from
	   dropping the first character of a string that has no BOM. */
	if(*src == 0xFEFF || *src == 0xFFFE)
		src++;

	@<construct utf8 string@>;

	/* make room for the terminating zero byte */
	old_ret = ret;
	ret = ufunct->realloc(ret, (ret_len + 1) * sizeof(char));
	if(!ret){
		ufunct->error = U_ERR_ALLOCATION_FAILED;
		/* with nothing converted there is no earlier buffer to terminate */
		if(ret_len > 0)
			old_ret[ret_len - 1] = '\0';
		return old_ret;
	}

	ret[ret_len] = '\0';

	return ret;
}

@ 

If |0xDC00 <= *src && *src < 0xE000| then this is a continuant (low
surrogate) character and is out of place if we find it in this part of
the string.

@<construct utf8 string@>=

while(*src){

	/* reserve the first byte of the next UTF-8 character */
	ret_len++;
	old_ret = ret;
	ret = ufunct->realloc(ret, ret_len * sizeof(char));
	if(!ret){
		ufunct->error = U_ERR_ALLOCATION_FAILED;
		/* on the very first pass there is no earlier buffer to terminate */
		if(old_ret)
			old_ret[ret_len - 2] = '\0';
		return old_ret;
	}

	/* control characters other than tab, line feed and carriage return
	   are rejected */
	if(*src < 0x20 && *src != 0x9 && *src !=0xA && *src != 0xD){
		@<illegal string@>;
	}else if(*src < 0x80){
		@<one byte sequence@>;
	}else if(*src < 0x0800){
		@<two byte sequence@>;
	}else if(*src < 0xD800){
		@<three byte sequence@>;
	}else if(*src < 0xDC00){
		/* high surrogate: the start of a surrogate pair */
		@<four byte sequence@>;
	}else if(*src < 0xE000){
		/* low surrogate without a preceding high surrogate */
		@<corrupted string@>;
	}else if(*src < 0x110000){
		@<three byte sequence@>;
	}else{
		@<illegal string@>;
	}
}

@ 

This is the easiest conversion, we just set this |char| equal to |src|.

@<one byte sequence@>=

/* seven bit characters are copied through unchanged */
ret[ret_len - 1] = (char)*src;
src++;

@ 

We have to use |tmp_src| here as the user probably won't want us to
bit shift his string.

@<two byte sequence@>=
{
	Uchar tmp_src = *src++;

	/* the lead byte is already reserved; add the trailing byte */
	ret_len++;
	old_ret = ret;
	ret = ufunct->realloc(ret, ret_len * sizeof(char));
	if(!ret){
		ufunct->error = U_ERR_ALLOCATION_FAILED;
		old_ret[ret_len - 2] = '\0';
		return old_ret;
	}

	/* Built back to front: the trailing byte carries the low six bits
	   and the lead byte the five bits above them. */
	ret[ret_len - 1] = tmp_src & 0x3F;
	ret[ret_len - 1] |= 0x80;
	tmp_src >>= 6;

	ret[ret_len - 2] = tmp_src & 0x1F;
	ret[ret_len - 2] |= 0xC0;
}

@ 

Same as the others.

@<three byte sequence@>=
{
	Uchar tmp_src = *src++;

	/* the lead byte is already reserved; add the two trailing bytes */
	ret_len += 2;
	old_ret = ret;
	ret = ufunct->realloc(ret, ret_len * sizeof(char));
	if(!ret){
		ufunct->error = U_ERR_ALLOCATION_FAILED;
		/* terminate at the lead byte so no partial character is left */
		old_ret[ret_len - 3] = '\0';
		return old_ret;
	}

	/* Built back to front: two trailing bytes of six bits each, then
	   the lead byte with the top four bits. */
	ret[ret_len - 1] = tmp_src & 0x3F;
	ret[ret_len - 1] |= 0x80;
	tmp_src >>= 6;

	ret[ret_len - 2] = tmp_src & 0x3F;
	ret[ret_len - 2] |= 0x80;
	tmp_src >>= 6;

	ret[ret_len - 3] = tmp_src & 0x0F;
	ret[ret_len - 3] |= 0xE0;
}

@ 

Every legal four byte sequence takes up two |Uchar|, so we need to
construct a UCS-4 character to make it easier to convert it to UTF-8.
We preallocate the four bytes needed as this allows us to create the
UTF-8 char in reverse.

This assumes that |int| is at least 32 bits wide.

@<four byte sequence@>=
{
	unsigned int ucs;
	unsigned int high;
	unsigned int low;

	/* a high surrogate must be followed by a low surrogate */
	if(*(src + 1) < 0xDC00 || *(src + 1) > 0xDFFF){
		@<corrupted string@>;
	}

	/* Read the two halves in separate statements; stepping |src| twice
	   inside one expression has no defined order. */
	high = *src++;
	low = *src++;
	ucs = ((high - 0xD800) * 0x0400) + (low - 0xDC00) + 0x010000;

	/* the lead byte is already reserved; add the three trailing bytes */
	ret_len += 3;
	old_ret = ret;
	ret = ufunct->realloc(ret, ret_len * sizeof(char));
	if(!ret){
		ufunct->error = U_ERR_ALLOCATION_FAILED;
		/* terminate at the lead byte so no partial character is left */
		old_ret[ret_len - 4] = '\0';
		return old_ret;
	}

	/* built back to front, six bits for each trailing byte */
	ret[ret_len - 1] = ucs & 0x3F;
	ret[ret_len - 1] |= 0x80;
	ucs >>= 6;

	ret[ret_len - 2] = ucs & 0x3F;
	ret[ret_len - 2] |= 0x80;
	ucs >>= 6;

	ret[ret_len - 3] = ucs & 0x3F;
	ret[ret_len - 3] |= 0x80;
	ucs >>= 6;

	/* the lead byte keeps the remaining three bits */
	ret[ret_len - 4] = ucs & 0x07;
	ret[ret_len - 4] |= 0xF0;

}

@ 

The user needs to know where the error occurred, so we return what we
have been able to process and set an error on the document.

@<illegal string@>=

/* record the error, then close off the part converted so far; the
   slot reserved for the rejected character takes the terminator */
ufunct->error = U_ERR_ILLEGAL_CHARACTER;
ret[ret_len - 1] = '\0';

return ret;

@ 

The user needs to know where the error occurred, so we return what we
have been able to process and set an error on the document.

@<corrupted string@>=

/* record the error, then close off the part converted so far; the
   slot reserved for the broken character takes the terminator */
ufunct->error = U_ERR_MALFORMED_CHARACTER;
ret[ret_len - 1] = '\0';

return ret;

@* Index.
