@i ../copyright.w

@* unicode.

This library is a set of utilities for handling Unicode in C.

@c

#ifndef _UNICODE_H_
#define _UNICODE_H_

#ifdef __cplusplus
extern "C" {
#endif

#include <stdlib.h>
#include <sys/types.h>

/* A |Uchar| holds one unsigned 16-bit value; strings are arrays of them. */
typedef u_int16_t Uchar; 



/* Win32 magic: choose the linkage decoration for public symbols.  It is
   empty outside Win32 and for static builds; inside a DLL build it
   exports, and for users of the DLL it imports. */
#if !defined(_WIN32) || defined(STATIC)
# define UNI_DL_IMPORT
#elif defined(BUILD)
# define UNI_DL_IMPORT __declspec(dllexport)
#else
# define UNI_DL_IMPORT __declspec(dllimport)
#endif


/* Error codes kept in the |error| member of |Ufunctions|; the default
   table starts out holding |U_ERR_NONE|. */
enum __Uerror {
U_ERR_ILLEGAL_CHARACTER,@/
U_ERR_MALFORMED_CHARACTER,@/
U_ERR_ALLOCATION_FAILED,@/
U_ERR_NONE
};

/* Short name for the error code type. */
typedef enum __Uerror Uerror;

@ 

This library needs to be usable in apache, so we have a structure telling
it what calloc, etc functions to use.  If you are using apache then you
would define your own calloc, etc functions that work on the appropriate
pool and set the members of this structure to point to these functions.

Of course, you should set the free member to point to a function that
does nothing in apache, but I've included this member for other
applications which might have a use for it.

If you aren't using apache then there is a default |Ufunctions| that
points to the regular calloc, etc.  To make things easier there are
two APIs.  The one with the prefix |u_| automatically passes the default
|Ufunctions| so you never have to think about it.  The one with the
prefix |uni_| requires you to pass your |Ufunctions| wherever 
necessary.

@c

/* Allocator table.  The three entry points have the signatures of the
   standard |calloc|, |realloc| and |free|; |error| holds an error code. */
typedef struct __Ufunctions {
  void *(*calloc)(size_t nmemb, size_t size);
  void *(*realloc)(void *ptr, size_t size);
  void (*free)(void *ptr);
  Uerror error;
} Ufunctions;

#ifndef _UNICODE_MAIN_
/* Every other file only refers to the default table. */
UNI_DL_IMPORT extern Ufunctions __ufunctions;
#else
/* The file that defines |_UNICODE_MAIN_| owns the default table, which is
   wired to the C library allocator. */
UNI_DL_IMPORT Ufunctions __ufunctions = { calloc, realloc, free, U_ERR_NONE };
/* In that file the bare word |Uerror| now names the error slot of the
   default table, so the type name is no longer usable after this line. */
#define Uerror __ufunctions.error
#endif


@ 

Create a Unicode string from a series of 16-bit numbers; the series must
include the terminating zero (NUL) character.

The intention of this function is that your editor should have a mode
that allows you to enter a Unicode string from your keyboard, with a
macro that automatically inserts the appropriate hexadecimal numbers
and the commas between them.  This makes creating Unicode strings
as easy as creating 8-bit strings.

In emacs for example you might write a patch for C mode where the
key sequence C-c C-z prompts the user for a string.  The macro
then creates a comment with the entered string in readable form
and a true form for the compiler.  For example, if you had typed
the line


|Uchar foo[] = {|


and then typed


C-c C-z "Hi there!"


The macro would produce


/* |Uchar foo[] = {"Hi there!" |*/

|Uchar foo[] = {0x48, 0x69, 0x20, 0x74, 0x68, 0x65, 0x72, 0x65, 0x21, 0|


@c

/* Build a |Uchar| string from the variable arguments; the allocator
   table comes first. */
UNI_DL_IMPORT Uchar *uni_make(Ufunctions *funcs, ...);

#if defined(__GNUC__)
/* GNU C only: the same call with the default table filled in. */
#define u_make(args...) (uni_make(&__ufunctions , ## args))
#endif

@* Printing functions.

This is just to create a uniform API.

@c

#if defined(__GNUC__)
/* GNU C only: both spellings forward the format and the remaining
   arguments to |wprintf| unchanged. */
#define uni_printf(fmt, args...) (wprintf((fmt) , ## args))
#define u_printf(fmt, args...) (wprintf((fmt) , ## args))
#endif


@* Character functions

@

|Uchar| versions of is*() functions

@c
/* Each classification macro casts its argument to |wchar_t| and hands it
   to the wide-character classifier of the same name.  The argument is
   parenthesized so that an expression argument is cast as a whole, not
   just its first operand. */
#define uni_isalnum(c) iswalnum((wchar_t)(c))
#define uni_isalpha(c) iswalpha((wchar_t)(c))
/* NOTE(review): |iswascii| is not an ISO C function; presumably the
   platform supplies it -- confirm before relying on |uni_isascii|. */
#define uni_isascii(c) iswascii((wchar_t)(c))
#define uni_iscntrl(c) iswcntrl((wchar_t)(c))
#define uni_isdigit(c) iswdigit((wchar_t)(c))
#define uni_isgraph(c) iswgraph((wchar_t)(c))
#define uni_islower(c) iswlower((wchar_t)(c))
#define uni_isprint(c) iswprint((wchar_t)(c))
#define uni_ispunct(c) iswpunct((wchar_t)(c))
#define uni_isspace(c) iswspace((wchar_t)(c))
#define uni_isupper(c) iswupper((wchar_t)(c))
#define uni_isxdigit(c) iswxdigit((wchar_t)(c))

/* No allocator is involved, so the short names are plain aliases. */
#define u_isalnum(c) uni_isalnum(c)
#define u_isalpha(c) uni_isalpha(c)
#define u_isascii(c) uni_isascii(c)
#define u_iscntrl(c) uni_iscntrl(c)
#define u_isdigit(c) uni_isdigit(c)
#define u_isgraph(c) uni_isgraph(c)
#define u_islower(c) uni_islower(c)
#define u_isprint(c) uni_isprint(c)
#define u_ispunct(c) uni_ispunct(c)
#define u_isspace(c) uni_isspace(c)
#define u_isupper(c) uni_isupper(c)
#define u_isxdigit(c) uni_isxdigit(c)

@

|Uchar| versions of to* functions

@c
/*  fix problem with no wchar on FreeBSD (other oses?)*/

/* Case mapping: use the wide-character routines when the build says they
   exist, otherwise fall back to the narrow ones.  Arguments are
   parenthesized so that an expression argument is cast as a whole. */
#ifdef HAVE_TOWLOWER
#define uni_tolower(c) ((Uchar)towlower((wchar_t)(c)))
#define uni_toupper(c) ((Uchar)towupper((wchar_t)(c)))
#else
/* NOTE(review): |tolower| and |toupper| are only specified for values
   that fit an |unsigned char| (or |EOF|); presumably callers on this path
   pass only such values -- confirm. */
#define uni_tolower(c) ((Uchar)tolower((wchar_t)(c)))
#define uni_toupper(c) ((Uchar)toupper((wchar_t)(c)))
#endif

/* Narrows the value to a |Uchar|; no 7-bit masking is done. */
#define uni_toascii(c) ((Uchar)(c))

/* No allocator is involved, so the short names are plain aliases. */
#define u_tolower(c) uni_tolower(c)
#define u_toupper(c) uni_toupper(c)
#define u_toascii(c) uni_toascii(c)
#define u__tolower(c) uni_tolower(c)
#define u__toupper(c) uni_toupper(c)


@* String functions.

@ 

|Uchar| version of index() and rindex().

@c
/* |index| and |rindex| are older names for |strchr| and |strrchr|, so
   these macros simply forward to |uni_strchr| and |uni_strrchr|. */
#define uni_index(str, c)  (uni_strchr((str), (c)))
#define uni_rindex(str, c) (uni_strrchr((str), (c)))

#define u_index(str, c)  (uni_index((str), (c)))
#define u_rindex(str, c) (uni_rindex((str), (c)))
@ 

|Uchar| version of strcasecmp() and strncasecmp().

@c

/* Comparison that ignores case; the |n| form takes an additional
   |size_t| argument. */
UNI_DL_IMPORT int uni_strcasecmp(Uchar *s1, Uchar *s2);
UNI_DL_IMPORT int uni_strncasecmp(Uchar *s1, Uchar *s2, size_t n);

/* No allocator is involved, so the short names are plain aliases. */
#define u_strcasecmp(s1, s2) (uni_strcasecmp((s1), (s2)))
#define u_strncasecmp(s1, s2, n) (uni_strncasecmp((s1), (s2), (n)))


@ 

Append second argument to the first.  This function allocates any
extra memory needed for you.


@c

/* The |uni_| forms take the allocator table as their first argument;
   the |u_| forms take only the strings. */

/* Append |src| to |dest|. */
UNI_DL_IMPORT Uchar *uni_strcat(Ufunctions *funcs, Uchar *dest, const Uchar *src);
UNI_DL_IMPORT Uchar *u_strcat(Uchar *dest, const Uchar *src);

/* As above, with an additional |size_t| argument. */
UNI_DL_IMPORT Uchar *uni_strncat(Ufunctions *funcs, Uchar *dest, const Uchar *src, size_t n);
UNI_DL_IMPORT Uchar *u_strncat(Uchar *dest, const Uchar *src, size_t n);

/* Here the destination is passed by address. */
UNI_DL_IMPORT Uchar *uni_strappend(Ufunctions *funcs, Uchar **dest, const Uchar *src);
UNI_DL_IMPORT Uchar *u_strappend(Uchar **dest, const Uchar *src);

UNI_DL_IMPORT Uchar *uni_strnappend(Ufunctions *funcs, Uchar **dest, const Uchar *src, size_t n);
UNI_DL_IMPORT Uchar *u_strnappend(Uchar **dest, const Uchar *src, size_t n);

@ 

|Uchar| version of strchr() and strrchr().

@c

/* Look for the value |c| in the string |str|. */
UNI_DL_IMPORT Uchar *uni_strchr(const Uchar *str, Uchar c);
UNI_DL_IMPORT Uchar *uni_strrchr(const Uchar *str, Uchar c);

/* No allocator is involved, so the short names are plain aliases. */
#define u_strchr(str, c)  (uni_strchr((str), (c)))
#define u_strrchr(str, c) (uni_strrchr((str), (c)))

@ 

Compare two unicode strings.

@c

/* Compare the |Uchar| strings |s1| and |s2|. */
UNI_DL_IMPORT int uni_strcmp(Uchar *s1, Uchar *s2);
/* Alias for |uni_strcmp|; both arguments are forwarded, so |s1| really is
   compared with |s2|. */
#define u_strcmp(s1, s2) (uni_strcmp((s1), (s2)))

/* Compare the |Uchar| strings |s1| and |s2|, with an additional |size_t|
   argument |n|. */
UNI_DL_IMPORT int uni_strncmp(Uchar *s1, Uchar *s2, size_t n);
/* Alias for |uni_strncmp|; all three arguments are forwarded, so |s1|
   really is compared with |s2|. */
#define u_strncmp(s1, s2, n) (uni_strncmp((s1), (s2), (n)))

@ 

|Uchar| versions of strspn() and strcspn().

@c

/* Span functions; the parameter names follow the C library's |strspn|
   (a set to accept) and |strcspn| (a set to reject). */
UNI_DL_IMPORT size_t uni_strspn(Uchar *s, const Uchar *accept);
UNI_DL_IMPORT size_t uni_strcspn(Uchar *s, const Uchar *reject);

/* No allocator is involved, so the short names are plain aliases. */
#define u_strspn(s, accept)  (uni_strspn((s), (accept)))
#define u_strcspn(s, reject) (uni_strcspn((s), (reject)))

@ 

|Uchar| version of |strdup()|, and a new one that only copies
a portion of a string.


@c

/* Duplicate |str|.  The |uni_| forms take the allocator table as their
   first argument; the |u_| forms take only the string. */
UNI_DL_IMPORT Uchar *uni_strdup(Ufunctions *funcs, const Uchar *str);
UNI_DL_IMPORT Uchar *u_strdup(const Uchar *str);

/* Partial copy: the portion is given by the |size_t| argument |n|. */
UNI_DL_IMPORT Uchar *uni_strndup(Ufunctions *funcs, const Uchar *str, size_t n);
UNI_DL_IMPORT Uchar *u_strndup(const Uchar *str, size_t n);

@

|Uchar*| version of |strcpy()| and |strncpy()|


@c

/* Copy |src| into |dest|.  Neither spelling takes an allocator table, and
   here the |u_| names are declared as functions rather than macros. */
UNI_DL_IMPORT Uchar *uni_strcpy(Uchar *dest, const Uchar *src);
UNI_DL_IMPORT Uchar *u_strcpy(Uchar *dest, const Uchar *src);

/* As above, with an additional |size_t| argument |n|. */
UNI_DL_IMPORT Uchar *uni_strncpy(Uchar *dest, const Uchar *src, size_t n);
UNI_DL_IMPORT Uchar *u_strncpy(Uchar *dest, const Uchar *src, size_t n);

@ 

 Return the number of 16-bit units (|Uchar| elements) in |string|.
 

@c

/* Length of |str|, counted in |Uchar| elements. */
UNI_DL_IMPORT size_t uni_strlen(const Uchar *str);
/* No allocator is involved, so the short name is a plain alias. */
#define u_strlen(str) (uni_strlen((str)))

@ 

|Uchar| version of strpbrk().

@c

/* Search |str| using the set of values in |accept|. */
UNI_DL_IMPORT Uchar *uni_strpbrk(const Uchar *str, const Uchar *accept);
/* No allocator is involved, so the short name is a plain alias. */
#define u_strpbrk(str, accept) (uni_strpbrk((str), (accept)))

@ 

|Uchar| version of strsep().

@c

/* Unlike the C library's |strsep|, this one also takes an allocator
   table; the string is passed by address. */
UNI_DL_IMPORT Uchar *uni_strsep(Ufunctions *funcs, Uchar **stringp, Uchar *delim);
/* The short name supplies the default table. */
#define u_strsep(str, delim) (uni_strsep(&__ufunctions, (str), (delim)))

@ 

|Uchar| version of strstr().

@c

/* Look for the string |needle| inside |haystack|. */
UNI_DL_IMPORT Uchar *uni_strstr(Uchar *haystack, Uchar *needle);
/* No allocator is involved, so the short name is a plain alias. */
#define u_strstr(haystack, needle) (uni_strstr((haystack), (needle)))

@

|Uchar| version of strtok().

@c

/* Tokenizer taking a string and a set of delimiters. */
UNI_DL_IMPORT Uchar *uni_strtok(Uchar *str, const Uchar *delim);
/* No allocator is involved, so the short name is a plain alias. */
#define u_strtok(str, delim) (uni_strtok((str), (delim)))


@* Conversion functions.

Convert UTF-8 string to |Uchar*|.

@c

/* Convert the |char| string |utf8| into a |Uchar| string; the allocator
   table comes first. */
UNI_DL_IMPORT Uchar *uni_utf8_to_uc(Ufunctions *funcs, char *utf8);
/* The short name supplies the default table. */
#define u_utf8_to_uc(str) (uni_utf8_to_uc(&__ufunctions, (str)))

@ 

Convert |Uchar| string to UTF-8.

@c

/* Convert the |Uchar| string |str| into a |char| string; the allocator
   table comes first. */
UNI_DL_IMPORT char *uni_uc_to_utf8(Ufunctions *funcs, Uchar *str);
/* The short name supplies the default table. */
#define u_uc_to_utf8(str) (uni_uc_to_utf8(&__ufunctions, (str)))

@* Characters.

These are some useful character definitions.

Useful strings from XML.

@c

#ifdef _UNICODE_MAIN_

/* Definitions, emitted only in the file that defines |_UNICODE_MAIN_|.
   Every initializer starts with 0xFEFF (the Unicode byte-order mark code
   point) and ends with a zero terminator. */
UNI_DL_IMPORT Uchar U_END_ELEM_NOCONT[] = { 0xFEFF,  ' ', '/', '>', '\0' };
UNI_DL_IMPORT Uchar U_END_ELEM[] = { 0xFEFF,  '<', '/', '\0' };
UNI_DL_IMPORT Uchar U_EQ_QUOT[] = { 0xFEFF,  '=', '\"', '\0' };
UNI_DL_IMPORT Uchar U_CDATA[] = {0xFEFF,   '<', '!', '[', 'C', 'D', 'A', 'T', 'A', '[', ' ', 
			'\0' };
UNI_DL_IMPORT Uchar U_END_CDATA[] = { 0xFEFF,  ']', ']', '>', '\0' };
UNI_DL_IMPORT Uchar U_ENT[] = { 0xFEFF,  '<', '!', 'E', 'N', 'T', 'I', 'T', 'Y', ' ', '\0' };
UNI_DL_IMPORT Uchar U_PI[] = {0xFEFF,   '<', '?', '\0' };
UNI_DL_IMPORT Uchar U_END_PI[] = { 0xFEFF,  '?', '>', '\0' };
UNI_DL_IMPORT Uchar U_COMM[] = {0xFEFF,   '<', '!', '-', '-', '\0' };
UNI_DL_IMPORT Uchar U_END_COMM[] = { 0xFEFF,  '-', '-', '>', '\0' };
UNI_DL_IMPORT Uchar U_DOCTYPE[] = { 0xFEFF,  '<', '!', 'D', 'O', 'C', 'T', 'Y', 'P', 'E', ' ', 
				'\0' };
UNI_DL_IMPORT Uchar U_NOTATION[] = {0xFEFF,   '<', '!', 'N', 'O', 'T', 'A', 'T', 'I', 'O', 'N',
				' ', '\0'};
UNI_DL_IMPORT Uchar U_DEFAULT_PROLOG[] = {0xFEFF, '<','?','x','m','l',' ',
	'v','e','r','s','i','o','n','=','"','1','.','0','"',' ',
  'e','n','c','o','d','i','n','g','=','"','u','t','f','-','8','"',' ','?','>','\0'};

#else

/* Declarations for every other file.  Each explicit size counts every
   element of the matching initializer, the leading 0xFEFF and the
   terminator included, so a size must change together with its
   definition. */
UNI_DL_IMPORT extern Uchar U_END_ELEM_NOCONT[5];
UNI_DL_IMPORT extern Uchar U_END_ELEM[4];
UNI_DL_IMPORT extern Uchar U_EQ_QUOT[4];
UNI_DL_IMPORT extern Uchar U_CDATA[12];
UNI_DL_IMPORT extern Uchar U_END_CDATA[5];
UNI_DL_IMPORT extern Uchar U_ENT[11];
UNI_DL_IMPORT extern Uchar U_PI[4];
UNI_DL_IMPORT extern Uchar U_END_PI[4];
UNI_DL_IMPORT extern Uchar U_COMM[6];
UNI_DL_IMPORT extern Uchar U_END_COMM[5];
UNI_DL_IMPORT extern Uchar U_DOCTYPE[12];
UNI_DL_IMPORT extern Uchar U_NOTATION[13];
UNI_DL_IMPORT extern Uchar U_DEFAULT_PROLOG[41];

#endif /* \_UNICODE\_MAIN\_ */

@* ASCII.

US-ASCII character set.

@c

#ifndef _UNICODE_MAIN_

/* Declarations for every other file; each array has three elements. */
UNI_DL_IMPORT extern Uchar U_SP[3];
UNI_DL_IMPORT extern Uchar U_QUOT[3];
UNI_DL_IMPORT extern Uchar U_AMP[3];
UNI_DL_IMPORT extern Uchar U_SEMICOL[3];
UNI_DL_IMPORT extern Uchar U_LT[3];
UNI_DL_IMPORT extern Uchar U_GT[3];

#else

/* Definitions, emitted only in the file that defines |_UNICODE_MAIN_|.
   Each string is 0xFEFF, one ASCII character and a zero terminator. */
UNI_DL_IMPORT Uchar U_SP[] = { 0xFEFF, 0x20, 0 };      /* space */
UNI_DL_IMPORT Uchar U_QUOT[] = { 0xFEFF, 0x22, 0 };    /* double quote */
UNI_DL_IMPORT Uchar U_AMP[] = { 0xFEFF, 0x26, 0 };     /* ampersand */
UNI_DL_IMPORT Uchar U_SEMICOL[] = { 0xFEFF, 0x3B, 0 }; /* semicolon */
UNI_DL_IMPORT Uchar U_LT[] = { 0xFEFF, 0x3C, 0 };      /* less-than sign */
UNI_DL_IMPORT Uchar U_GT[] = { 0xFEFF, 0x3E, 0 };      /* greater-than sign */

#endif /* \_UNICODE\_MAIN\_ */

#ifdef __cplusplus
}
#endif

#endif /* \_UNICODE\_H\_ */

@* Index.
