@i ../copyright.w

@* uni\_utf8\_to\_uc.

Convert a UTF-8 encoded |char| string to a UTF-16 |Uchar| string.  

We just increment the pointer |src| rather than keeping an index into
the string.  We do need an index (|ret_len|) into the |Uchar|
string |ret| that is to be returned however.

@c

#include <unicode.h>

Uchar*
uni_utf8_to_uc(Ufunctions *ufunct, char *src){
	Uchar *ret = NULL;	/* the string being built */
	Uchar *old_ret = NULL;	/* buffer as it was before the latest resize */
	int ret_len = 1;	/* number of code units held in |ret| */

	ufunct->error = U_ERR_NONE;

	/* the first slot holds 0xFEFF, the Unicode byte order mark */
	ret = ufunct->realloc(ret, sizeof(Uchar) * ret_len);
	if(ret == NULL){
		ufunct->error = U_ERR_ALLOCATION_FAILED;
		return NULL;
	}
	ret[0] = 0xFEFF;

	/* each pass consumes one complete sequence from |src|; the
	   section below returns directly on any error */
	while(*src != '\0'){
		unsigned int src_short = *src;

		@<construct utf sixteen char@>;

	}

	/* one more slot is needed for the terminating zero */
	old_ret = ret;
	ret = ufunct->realloc(old_ret, (ret_len + 1) * sizeof(Uchar));
	if(ret == NULL){
		/* no room for a terminator, so the last code unit is
		   overwritten and the shorter string is handed back */
		old_ret[ret_len - 1] = 0;
		ufunct->error = U_ERR_ALLOCATION_FAILED;
		return old_ret;
	}

	ret[ret_len] = 0;
	return ret;
}

@ 

Each sequence in a UTF-8 string starts with a byte that tells you how
many bytes long this character is.  The information is passed as:

0xxxxxxx - One byte (ie, value of first byte is less than 0x80).

110xxxxx - Two bytes (ie, value of first byte is less than 0xE0).

1110xxxx - Three bytes (ie, value of first byte is less than 0xF0).

11110xxx - Four bytes (ie, value of first byte is less than 0xF8).

111110xx - Five bytes (ie, value of first byte is less than 0xFC).

1111110x - Six bytes (ie, value of first byte is less than 0xFE).

We only go as far as four bytes as values above 0x0010FFFF are illegal
in UTF-16.

@<construct utf sixteen char@>=
/* make room for the code unit this sequence will produce */
ret_len++;
old_ret = ret;
ret = ufunct->realloc(ret, ret_len * sizeof(Uchar));
if(!ret){
	/* the code relies on the old buffer surviving a failed resize;
	   it is terminated in its last slot and returned */
	ufunct->error = U_ERR_ALLOCATION_FAILED;
	old_ret[ret_len - 2] = 0;
	return old_ret;
}
ret[ret_len - 1] = 0;

/* Classify the lead byte by its high bits.  A mask test gives the same
   answer whether |char| is signed or unsigned, and it refuses a stray
   continuant byte (10xxxxxx) found where a lead byte is expected. */
if(src_short < 0x20 && src_short != 0x9 && src_short != 0xA &&
		src_short != 0xD){
	@<illegal string@>;
}else if(src_short < 0x80){
	@<do single byte@>;
}else if((src_short & 0xE0) == 0xC0){
	@<do two bytes@>;
}else if((src_short & 0xF0) == 0xE0){
	@<do three bytes@>;
}else if((src_short & 0xF8) == 0xF0){
	@<do four bytes@>;
}else{
	@<illegal string@>;
}

@ 

The user needs to know where the error occurred, so we have to return
what we managed to process so far and set an error in the document.

@<illegal string@>=

/* record the reason first, then terminate the output in the slot that
   was reserved for the offending character */
ufunct->error = U_ERR_ILLEGAL_CHARACTER;
ret[ret_len - 1] = 0;

return ret;

@ 

The user needs to know where the error occurred, so we have to return
what we managed to process so far and set an error in the document.

@<corrupted string@>=

/* record the reason first, then terminate the output in the slot that
   was reserved for the broken sequence */
ufunct->error = U_ERR_MALFORMED_CHARACTER;
ret[ret_len - 1] = 0;

return ret;

@ 

Type conversion will automatically insert the contents of this |char|
into the 8 least significant bits of |ret|.

The value of this sequence is between 0x00000000 and 0x0000007F.

@<do single byte@>=

/* a one byte sequence is its own value; copy it and step over it */
ret[ret_len - 1] = (Uchar) *src;
src++;

@ 

We need to bit shift the value left by six bits so that we have enough
space for the bits in the next byte in the sequence.

The value of this sequence is between 0x00000080 and 0x000007FF

@<do two bytes@>=

/* the lead byte supplies the five high bits of the value */
ret[ret_len - 1] = (Uchar) *src++ & 0x1F;
src_short = *src;
ret[ret_len - 1] <<= 6;

@<do last continuant byte@>;

/* A value below 0x80 fits in a single byte, so meeting one here means
   an overlong form.  Such forms must be refused, otherwise they slip
   past the control character test made on single bytes. */
if(ret[ret_len - 1] < 0x80){
	@<illegal string@>;
}

@ 

Each continuant byte in the sequence is of the form 10xxxxxx, so that we
can distinguish it from other byte types.  ie, if this is a continuant
byte (as it should be) then it will have a value less than 0xC0.

@<do last continuant byte@>=
/* A continuant byte is exactly 10xxxxxx.  Testing both top bits, not
   just an upper bound, refuses plain ASCII bytes and, above all, the
   terminating zero of a truncated sequence, so we never step past the
   end of the input. */
if((src_short & 0xC0) == 0x80){
	ret[ret_len - 1] |= (Uchar) *src++ & 0x3F;
}else{
	@<corrupted string@>;
}

@ 

The value of this sequence is between 0x00000800 and 0x0000FFFF - 
Excluding 0x0000D800 - 0x0000DFFF which are reserved for UTF-16
markers.  The values 0x0000FFFE and 0x0000FFFF are also not defined
in ISO/IEC 10646 and are illegal in unicode.

@<do three bytes@>=

/* the lead byte supplies the four high bits of the value */
ret[ret_len - 1] = (Uchar) *src++ & 0x0F;
src_short = *src;
ret[ret_len - 1] <<= 6;

@<do continuant byte@>;
@<do last continuant byte@>;

if(ret[ret_len - 1] < 0x0800){
	/* fits in fewer bytes, so this is an overlong form */
	@<illegal string@>;
}else if(0xD7FF < ret[ret_len - 1] && ret[ret_len - 1] < 0xE000){
	/* the surrogate range is reserved for UTF-16 itself */
	@<illegal string@>;
}else if(ret[ret_len - 1] == 0xFFFE || ret[ret_len - 1] == 0xFFFF){
	/* these two values are never valid characters */
	@<illegal string@>;
}

@ 

Since there are more bytes in the sequence we need to do a bit shift
again.

@<do continuant byte@>=

/* A continuant byte is exactly 10xxxxxx.  Testing both top bits
   refuses plain ASCII bytes and the terminating zero of a truncated
   sequence, so we never step past the end of the input. */
if((src_short & 0xC0) == 0x80){
	ret[ret_len - 1] |= (Uchar) *src++ & 0x3F;
	src_short = *src;
	/* leave room for the six bits of the following byte */
	ret[ret_len - 1] <<= 6;
}else{
	@<corrupted string@>;
}

@ 

The value of this sequence is between 0x00010000 and 0x001FFFFF.
That is, all values in this range are represented by two |Uchar|s.  
If the value is greater than
0x0010FFFF then this is an illegal value for UTF-16.

This assumes that |int| is at least 32 bits wide.

@<do four bytes@>=
{
	unsigned int ucs = 0;

	@<construct UCS character@>;

	/* Values above 0x10FFFF cannot be expressed in UTF-16.  Values
	   below 0x10000 are overlong forms and would also make the
	   subtraction below wrap around, so both are refused. */
	if(ucs < 0x010000 || ucs > 0x10FFFF){
		@<corrupted string@>;
	}

	/* high surrogate: the upper ten bits of the offset value */
	ret[ret_len - 1] = ((ucs - 0x010000) / 0x0400) + 0xD800;

	/* a second code unit is needed for the low surrogate */
	ret_len++;
	old_ret = ret;
	ret = ufunct->realloc(ret, ret_len * sizeof(Uchar));
	if(!ret){
		/* overwrite the high surrogate so that the returned
		   string does not end in half a pair */
		ufunct->error = U_ERR_ALLOCATION_FAILED;
		old_ret[ret_len - 2] = 0;
		return old_ret;
	}

	/* low surrogate: the lower ten bits of the offset value */
	ret[ret_len - 1] = ((ucs - 0x010000) % 0x0400) + 0xDC00;
}

@ 

We need to construct the UCS-4 character for easy conversion to UTF-16.

@<construct UCS character@>=

/* the lead byte supplies the three high bits of the value */
ucs = (unsigned int) *src++ & 0x07;
ucs <<= 6;

/* Each of the three continuant bytes is fetched afresh before it is
   tested, and must be exactly 10xxxxxx.  This also stops us at the
   terminating zero of a truncated sequence. */
src_short = *src;
if((src_short & 0xC0) == 0x80){
	ucs |= (unsigned int) *src++ & 0x3F;
	ucs <<= 6;
}else{
	@<corrupted string@>;
}

src_short = *src;
if((src_short & 0xC0) == 0x80){
	ucs |= (unsigned int) *src++ & 0x3F;
	ucs <<= 6;
}else{
	@<corrupted string@>;
}

src_short = *src;
if((src_short & 0xC0) == 0x80){
	ucs |= (unsigned int) *src++ & 0x3F;
}else{
	@<corrupted string@>;
}

@* Index.