"======================================================================
|
|   Base encodings including Unicode (ISO10646)
|
|   $Revision: 1.95.1$
|   $Date: 2000/12/27 10:45:49$
|   $Author: pb$
|
 ======================================================================"


"======================================================================
|
| Copyright 1988-92, 1994-95, 1999, 2000 Free Software Foundation, Inc.
| Written by Paolo Bonzini.
|
| This file is part of the GNU Smalltalk class library.
|
| The GNU Smalltalk class library is free software; you can redistribute it
| and/or modify it under the terms of the GNU Lesser General Public License
| as published by the Free Software Foundation; either version 2.1, or (at
| your option) any later version.
| 
| The GNU Smalltalk class library is distributed in the hope that it will be
| useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser
| General Public License for more details.
| 
| You should have received a copy of the GNU Lesser General Public License
| along with the GNU Smalltalk class library; see the file COPYING.LESSER.
| If not, write to the Free Software Foundation, 59 Temple Place - Suite
| 330, Boston, MA 02111-1307, USA.  
|
 ======================================================================"

Smalltalk addSubspace: #Charsets!

"These classes are the only ones that stay out of the Charsets namespace."
WordArray variableWordSubclass: #UnicodeSequence
    instanceVariableNames: ''
    classVariableNames: ''
    poolDictionaries: 'Charsets'	"Import everything"
    category: 'i18n-Character sets'!

Stream subclass: #Charset
    instanceVariableNames: 'stream bad'
    classVariableNames: ''
    poolDictionaries: ''
    category: 'i18n-Character sets'!

!CharacterArray methodsFor: 'conversion'!

from: fromCharset to: toCharset bad: badCharacter
    ^(self asUnicode: fromCharset)
	as: toCharset
	bad: badCharacter
!

from: fromCharset to: toCharset
    ^(self asUnicode: fromCharset) as: toCharset
! !

!CharacterArray methodsFor: 'Unicode'!

asUnicode
    ^self asUnicode: ISO88591
!
        
asUnicode: fromCharset
    | unicode decoder |
    unicode := UnicodeSequence new: self size.
    decoder := fromCharset on: self.
    fromCharset isSingleByte
	ifTrue: [
	    1 to: self size do: [ :i | unicode at: i put: decoder next ].
	]
	ifFalse: [
	    unicode := WriteStream on: unicode.
	    decoder do: [ :each | unicode nextPut: decoder next ].
	    unicode := unicode contents
	].
    ^unicode
! !

!UnicodeSequence methodsFor: 'Unicode'!

asUTF8
    ^self asString: UTF8 bad: $?
!

asString: aCharset bad: bad
    | decoder |
    decoder := aCharset on: (String new: self size).
    decoder bad: bad.
    self do: [ :each | decoder nextPut: each ].
    ^decoder contents
!

asString: aCharset
    ^self asString: aStringClass bad: $?
!

asString
    ^self asString: Locale default charset
! !
    
Namespace current: Charsets!

Charset subclass: #SingleByte
    instanceVariableNames: ''
    classVariableNames: 'NullMap'
    poolDictionaries: ''
    category: 'i18n-Character sets'!

SingleByte class
    instanceVariableNames: 'reverseMap'!

Charset subclass: #MultiByte
    instanceVariableNames: ''
    classVariableNames: ''
    poolDictionaries: ''
    category: 'i18n-Character sets'!

MultiByte subclass: #TableEncodedMultiByte
    instanceVariableNames: ''
    classVariableNames: ''
    poolDictionaries: ''
    category: 'i18n-Character sets'!

MultiByte subclass: #UTF8
    instanceVariableNames: ''
    classVariableNames: ''
    poolDictionaries: ''
    category: 'i18n-Character sets'!

!Charset class methodsFor: 'creation'!

on: aStream
    ^self basicNew initialize: aStream
! !

!Charset class methodsFor: 'accessing'!

bad
    ^bad
!

bad: aCharacter
    bad := aCharacter
! !

!Charset class methodsFor: 'creation'!

initialize: aStream
    stream := aStream.
    bad := $?
! !

!Charset class methodsFor: 'testing'!

isSingleByte
    self subclassResponsibility
! !

!SingleByte class methodsFor: 'creation'!

on: aStream
    reverseMap isNil ifTrue: [ self computeReverseMap ].
    ^super on: aStream
! !

!Charset class methodsFor: 'testing'!

isSingleByte
    ^true
! !

!SingleByte class methodsFor: 'Unicode maps'!

reverseMap
    ^reverseMap
!

computeReverseMap
    | i |
    NullMap isNil ifTrue: [ NullMap := Array new: 256 withAll: 0 ].
    reverseMap := Array new: 256 withAll: NullMap.

    i := 255.
    self unicodeMap reverseDo: [ :each || thisMap lo hi |
	each == -1 ifFalse: [
	    lo := each \\ 256 + 1.
	    hi := each // 256 + 1.
	    (thisMap := reverseMap at: hi) == NullMap ifTrue: [
		thisMap := ByteArray new: 256.
		reverseMap at: hi put: thisMap
	    ].
	    thisMap at: lo put: i.
	    i := i - 1
	]
    ]!

unicodeMap
    self subclassResponsibility! !

!SingleByte methodsFor: 'encoding/decoding'!

next
    ^self unicodeMap at: stream next value + 1.
!

nextPut: anInteger
    | c size hi lo mapped |
    c := stream next.
    hi := c bitShift: -8.
    lo := c bitAnd: 63.

    mapped := (self class reverseMap at: hi + 1) at: lo + 1.

    ^(mapped > 0 or: [ hi + lo = 0 ])
	ifFalse: [ bad ]
	ifTrue: [ mapped asCharacter ]
! !

!MultiByte class methodsFor: 'testing'!

isSingleByte
    ^false
! !

!MultiByte methodsFor: 'DBCS support'!

sizeAt: c
    self subclassResponsibility
! !

!TableEncodedMultiByte class methodsFor: 'Unicode'!

nextPut: value
    | indices map c lo mid hi s index tuple |
    indices := self toMapping at: 1.
    map := self toMapping at: 2.

    lo := 1.
    hi := indices size.
    c := #nil.

    "Do a binary search on the indices table"
    [ lo <= hi ] whileTrue: [
	mid := (lo + hi) // 2.
	tuple := indices at: mid.
	value < (tuple at: 1)
	    ifTrue: [ hi := mid - 1 ]
	    ifFalse: [ value > (tuple at: 2)
		ifTrue: [ lo := mid + 1 ]
		ifFalse: [ c := map at: (tuple at: 3) + value. hi := -1 ]
	    ]
    ].

    "Finally, output the Unicode value"
    c == #nil ifTrue: [ c := bad value ].
    c > 16rFF ifTrue: [
	stream nextPut: (c bitShift: -8) asInteger.
	c := c bitAnd: 255
    ].
    stream nextPut: c asCharacter
! !

!TableEncodedMultiByte class methodsFor: 'tables'!

fromMapping
    self subclassResponsibility!

toMapping
    self subclassResponsibility!
    
sizeMap
    ^sizeMap!
    
computeSizeMap
    | map size |
    sizeMap := ByteArray new: 256.
    1 to: 256 do: [ :each |
	size := 1.
	map := (fromMapping at: 2) at: each.
	[ map class == Array ] whileTrue: [
	    size := size + 1.
	    
	    "Find a valid entry in the map (either an integer or an array ]
	    map := map
		detect: [ :each | each ~~ #nil ]
		ifNone: [ #nil ].
	].

	sizeMap at: each put: size.
    ]!

!TableEncodedMultiByte methodsFor: 'DBCS support'!

sizeAt: index
    ^self class sizeMap at: c + 1!

!TableEncodedMultiByte methodsFor: 'Unicode'!

next
    | map c i base firstMap out |
    map := self fromMapping at: 2.
    
    c := stream next.
    map := map at: c + 1.
    [
	map isInteger ifTrue: [ ^map ].
	map == #nil ifTrue: [ ^16rFFFD ]. "<REPLACEMENT CHARACTER>"

	"Else pass through another layer..."
	c := stream next.
	first := map at: 1.
	map := (c >= first and: [ c <= (map at: 3) ])
	    ifFalse: [ #nil ]
	    ifTrue: [ (map at: 2) at: (c - first + 1) ]
    ] repeat!

!UTF8 class methodsFor: 'Unicode'!

charsetName
    ^'UTF-8'! !

!UTF8 methodsFor: 'Unicode'!

nextPut: value
    | c |
    c := value.
    c >= 16r80 ifTrue: [
	c < 16r400
	    ifTrue: [
		"Up to 10 bits: 1100xxxx 10xxxxxx"
		stream nextPut: (16rC0 + (c bitShift: -6)) asCharacter.
	    ]
	    ifFalse: [
		"Up to 16 bits: 1110xxxx 10xxxxxx 10xxxxxx"
		stream nextPut: (16rE0 + (c bitShift: -12)) asCharacter.
		stream nextPut: (16r80 + (c // 64 bitAnd: 63)) asCharacter.
		i := i + 3.
	    ].

	"Store the 6 least significant bits..."
	c := 16r80 + (c bitAnd: 63).
    ].

    stream nextPut: c asCharacter
!

next
    | c s |
    c := stream next asInteger.
    s := self sizeMap at: c + 1.
    s > 1 ifFalse: [ ^c ].

    "Convert a multi-byte UTF8 sequence to 16-bit Unicode"
    c := self firstByteMap at: c + 1.
    s > 2 ifTrue: [ c := c + (stream next asInteger - 128 bitShift: 6) ]
    ^c + (stream next asInteger - 128).
!

sizeAt: index
    ^self sizeMap at: c + 1!

sizeMap
    "Sequences longer than 3-byte are not used by Unicode; values for
     them are included here so that size computations are correct, but
     conversions will fail as soon as the firstByteMap is accessed."

    ^#[ 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
	1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
	1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
	1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
	1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
	1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
	2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
	3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 5 5 5 5 6 6 6 6]!
    
firstByteMap
    ^#( 16r00 16r01 16r02 16r03 16r04 16r05 16r06 16r07
	16r08 16r09 16r0A 16r0B 16r0C 16r0D 16r0E 16r0F
	16r10 16r11 16r12 16r13 16r14 16r15 16r16 16r17
	16r18 16r19 16r1A 16r1B 16r1C 16r1D 16r1E 16r1F
	16r20 16r21 16r22 16r23 16r24 16r25 16r26 16r27
	16r28 16r29 16r2A 16r2B 16r2C 16r2D 16r2E 16r2F
	16r30 16r31 16r32 16r33 16r34 16r35 16r36 16r37
	16r38 16r39 16r3A 16r3B 16r3C 16r3D 16r3E 16r3F
	16r40 16r41 16r42 16r43 16r44 16r45 16r46 16r47
	16r48 16r49 16r4A 16r4B 16r4C 16r4D 16r4E 16r4F
	16r50 16r51 16r52 16r53 16r54 16r55 16r56 16r57
	16r58 16r59 16r5A 16r5B 16r5C 16r5D 16r5E 16r5F
	16r60 16r61 16r62 16r63 16r64 16r65 16r66 16r67
	16r68 16r69 16r6A 16r6B 16r6C 16r6D 16r6E 16r6F
	16r70 16r71 16r72 16r73 16r74 16r75 16r76 16r77
	16r78 16r79 16r7A 16r7B 16r7C 16r7D 16r7E 16r7F
	16r80 16r81 16r82 16r83 16r84 16r85 16r86 16r87
	16r88 16r89 16r8A 16r8B 16r8C 16r8D 16r8E 16r8F
	16r90 16r91 16r92 16r93 16r94 16r95 16r96 16r97
	16r98 16r99 16r9A 16r9B 16r9C 16r9D 16r9E 16r9F
	16rA0 16rA1 16rA2 16rA3 16rA4 16rA5 16rA6 16rA7
	16rA8 16rA9 16rAA 16rAB 16rAC 16rAD 16rAE 16rAF
	16rB0 16rB1 16rB2 16rB3 16rB4 16rB5 16rB6 16rB7
	16rB8 16rB9 16rBA 16rBB 16rBC 16rBD 16rBE 16rBF

	"2-byte sequences"
	16r000 16r040 16r080 16r0C0 16r100 16r140 16r180 16r1C0
	16r200 16r240 16r280 16r2C0 16r300 16r340 16r380 16r3C0
	16r400 16r440 16r480 16r4C0 16r500 16r540 16r580 16r5C0
	16r600 16r640 16r680 16r6C0 16r700 16r740 16r780 16r7C0

	"3-byte sequences"
	16r0000 16r1000 16r2000 16r3000 16r4000 16r5000 16r6000 16r7000 
	16r8000 16r9000 16rA000 16rB000 16rC000 16rD000 16rE000 16rF000)! !

Namespace current: Smalltalk!

