diff options
author | Mike Buland <eichlan@xagasoft.com> | 2011-04-04 07:22:10 +0000 |
---|---|---|
committer | Mike Buland <eichlan@xagasoft.com> | 2011-04-04 07:22:10 +0000 |
commit | abbf45c1da7f3e3a542e6c6339a1bab31283f22e (patch) | |
tree | 1d40f79bbe315294507bb9bfedfbe2b01e815c1a /src | |
parent | bc5fc82538f220f62f231d5bdda5910752156a32 (diff) | |
download | libbu++-abbf45c1da7f3e3a542e6c6339a1bab31283f22e.tar.gz libbu++-abbf45c1da7f3e3a542e6c6339a1bab31283f22e.tar.bz2 libbu++-abbf45c1da7f3e3a542e6c6339a1bab31283f22e.tar.xz libbu++-abbf45c1da7f3e3a542e6c6339a1bab31283f22e.zip |
I made some awesome progress on the UtfString system. It stores its data in native
UTF-16 encoding to make things easier (little endian in our case). It can currently
read UTF-8 and UTF-16BE, but it does not handle a BOM yet. It will give you full Unicode
code points instead of the raw UTF-16 values, which is pretty slick.
Diffstat (limited to 'src')
-rw-r--r-- | src/config.h | 3 | ||||
-rw-r--r-- | src/tests/utf.cpp | 3 | ||||
-rw-r--r-- | src/utfstring.cpp | 143 | ||||
-rw-r--r-- | src/utfstring.h | 20 |
4 files changed, 164 insertions, 5 deletions
diff --git a/src/config.h b/src/config.h index 3046b59..ce954de 100644 --- a/src/config.h +++ b/src/config.h | |||
@@ -17,4 +17,7 @@ | |||
17 | 17 | ||
18 | #include "bu/extratypes.h" | 18 | #include "bu/extratypes.h" |
19 | 19 | ||
20 | // Later if we need autoconfig stuff, here's where it'll go. | ||
21 | // #include "bu/autoconfig.h" | ||
22 | |||
20 | #endif | 23 | #endif |
diff --git a/src/tests/utf.cpp b/src/tests/utf.cpp index 59d49c6..9e075e2 100644 --- a/src/tests/utf.cpp +++ b/src/tests/utf.cpp | |||
@@ -16,7 +16,8 @@ int main( int argc, char *argv[] ) | |||
16 | int iAmnt = fIn.read( buf, 4096 ); | 16 | int iAmnt = fIn.read( buf, 4096 ); |
17 | sUtf8.append( buf, iAmnt ); | 17 | sUtf8.append( buf, iAmnt ); |
18 | } | 18 | } |
19 | Bu::UtfString::debugUtf8( sUtf8 ); | 19 | Bu::UtfString us( sUtf8, Bu::UtfString::Utf16 ); |
20 | us.debug(); | ||
20 | } | 21 | } |
21 | } | 22 | } |
22 | 23 | ||
diff --git a/src/utfstring.cpp b/src/utfstring.cpp index 0e2060b..bb0a011 100644 --- a/src/utfstring.cpp +++ b/src/utfstring.cpp | |||
@@ -9,17 +9,156 @@ | |||
9 | 9 | ||
10 | #include "bu/string.h" | 10 | #include "bu/string.h" |
11 | 11 | ||
12 | #include <endian.h> | ||
13 | |||
12 | Bu::UtfString::UtfString() | 14 | Bu::UtfString::UtfString() |
13 | { | 15 | { |
14 | } | 16 | } |
15 | 17 | ||
18 | Bu::UtfString::UtfString( const Bu::String &sInput, Encoding eEnc ) | ||
19 | { | ||
20 | set( sInput, eEnc ); | ||
21 | } | ||
22 | |||
16 | Bu::UtfString::~UtfString() | 23 | Bu::UtfString::~UtfString() |
17 | { | 24 | { |
18 | } | 25 | } |
19 | 26 | ||
27 | void Bu::UtfString::set( const Bu::String &sInput, Encoding eEnc ) | ||
28 | { | ||
29 | switch( eEnc ) | ||
30 | { | ||
31 | case Utf8: | ||
32 | setUtf8( sInput ); | ||
33 | break; | ||
34 | |||
35 | case Utf16: | ||
36 | case Utf16be: | ||
37 | setUtf16( sInput ); | ||
38 | break; | ||
39 | |||
40 | case Utf16le: | ||
41 | throw Bu::ExceptionBase("Utf16le not supported yet."); | ||
42 | break; | ||
43 | |||
44 | case Utf32: | ||
45 | throw Bu::ExceptionBase("Utf32 not supported yet."); | ||
46 | break; | ||
47 | |||
48 | case Ucs16: | ||
49 | throw Bu::ExceptionBase("Ucs16 not supported yet."); | ||
50 | break; | ||
51 | |||
52 | case GuessEncoding: | ||
53 | throw Bu::ExceptionBase("Guessing mode not supported yet."); | ||
54 | break; | ||
55 | } | ||
56 | } | ||
57 | |||
58 | void Bu::UtfString::append( UtfChar ch ) | ||
59 | { | ||
60 | if( ch >= 0x10000 ) | ||
61 | { | ||
62 | ch -= 0x10000; | ||
63 | append16( ((ch>>10)&0x3FF)| 0xD800u ); | ||
64 | append16( (ch&0x3FF)| 0xDC00u ); | ||
65 | } | ||
66 | else | ||
67 | { | ||
68 | append16( (uint16_t)(ch) ); | ||
69 | } | ||
70 | } | ||
71 | |||
72 | void Bu::UtfString::setUtf8( const Bu::String &sInput ) | ||
73 | { | ||
74 | static uint8_t lmask[8] = { | ||
75 | 0x00, | ||
76 | 0x01, | ||
77 | 0x03, | ||
78 | 0x07, | ||
79 | 0x0f, | ||
80 | 0x1f, | ||
81 | 0x3f, | ||
82 | 0x7f | ||
83 | }; | ||
84 | for( Bu::String::const_iterator i = sInput.begin(); i; i++ ) | ||
85 | { | ||
86 | if( ((int)(uint8_t)*i)&0x80 ) | ||
87 | { | ||
88 | int iBytes = 1; | ||
89 | for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { } | ||
90 | Bu::UtfChar uPt = ((*i) & lmask[7-iBytes])<<(6*(iBytes-1)); | ||
91 | for( iBytes--; iBytes >= 1; iBytes-- ) | ||
92 | { | ||
93 | i++; | ||
94 | uPt |= ((*i)&lmask[6])<<(6*(iBytes-1)); | ||
95 | } | ||
96 | append( uPt ); | ||
97 | } | ||
98 | else | ||
99 | { | ||
100 | append( (Bu::UtfChar)(*i) ); | ||
101 | } | ||
102 | } | ||
103 | } | ||
104 | |||
105 | void Bu::UtfString::setUtf16( const Bu::String &sInput ) | ||
106 | { | ||
107 | uint16_t hi, lo; | ||
108 | for( Bu::String::const_iterator i = sInput.begin(); i; i++ ) | ||
109 | { | ||
110 | hi = (((uint8_t)*i)<<8) | ((uint8_t)*(++i)); | ||
111 | append16( hi ); | ||
112 | if( (hi&0xD800u) == 0xD800u ) | ||
113 | { | ||
114 | lo = (((uint8_t)*(++i))<<8) | ((uint8_t)*(++i)); | ||
115 | append16( lo ); | ||
116 | } | ||
117 | } | ||
118 | } | ||
119 | |||
20 | #include "bu/sio.h" | 120 | #include "bu/sio.h" |
21 | using Bu::sio; | 121 | using Bu::sio; |
22 | 122 | ||
123 | Bu::UtfChar Bu::UtfString::get( int iIndex ) | ||
124 | { | ||
125 | Bu::UtfChar i = aData[iIndex]; | ||
126 | switch( i&0xFC00 ) | ||
127 | { | ||
128 | case 0xD800: | ||
129 | sio << "(hi) "; | ||
130 | return (((i&0x3FF)<<10) | ((aData[iIndex+1]&0x3FF)))+0x10000; | ||
131 | |||
132 | case 0xDC00: | ||
133 | sio << "(lo) "; | ||
134 | return 0; | ||
135 | |||
136 | default: | ||
137 | sio << "(--) "; | ||
138 | return i&0xFC00; | ||
139 | } | ||
140 | } | ||
141 | |||
142 | void Bu::UtfString::debug() | ||
143 | { | ||
144 | sio << "Raw Utf16: "; | ||
145 | for( int i = 0; i < aData.getSize(); i++ ) | ||
146 | { | ||
147 | if( i > 0 ) | ||
148 | sio << ", "; | ||
149 | sio << "0x" << Fmt::hex() << aData[i]; | ||
150 | } | ||
151 | sio << sio.nl; | ||
152 | sio << "Code Points: "; | ||
153 | for( int i = 0; i < aData.getSize(); i++ ) | ||
154 | { | ||
155 | if( i > 0 ) | ||
156 | sio << ", "; | ||
157 | sio << "0x" << Fmt::hex() << get( i ); | ||
158 | } | ||
159 | sio << sio.nl; | ||
160 | } | ||
161 | /* | ||
23 | void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 ) | 162 | void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 ) |
24 | { | 163 | { |
25 | static uint8_t lmask[8] = { | 164 | static uint8_t lmask[8] = { |
@@ -43,7 +182,7 @@ void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 ) | |||
43 | int iBytes = 1; | 182 | int iBytes = 1; |
44 | for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { } | 183 | for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { } |
45 | // sio << "iBytes = " << iBytes << sio.nl; | 184 | // sio << "iBytes = " << iBytes << sio.nl; |
46 | point uPt = ((*i) & lmask[7-iBytes])<<(6*(iBytes-1)); | 185 | Bu::UtfChar uPt = ((*i) & lmask[7-iBytes])<<(6*(iBytes-1)); |
47 | // sio << "mask: " << Bu::Fmt().radix(2).width(8).fill('0') | 186 | // sio << "mask: " << Bu::Fmt().radix(2).width(8).fill('0') |
48 | // << (int)lmask[7-iBytes] << sio.nl; | 187 | // << (int)lmask[7-iBytes] << sio.nl; |
49 | for( iBytes--; iBytes >= 1; iBytes-- ) | 188 | for( iBytes--; iBytes >= 1; iBytes-- ) |
@@ -68,4 +207,4 @@ void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 ) | |||
68 | } | 207 | } |
69 | sio << sio.nl; | 208 | sio << sio.nl; |
70 | } | 209 | } |
71 | 210 | */ | |
diff --git a/src/utfstring.h b/src/utfstring.h index 6f85e93..79ef62e 100644 --- a/src/utfstring.h +++ b/src/utfstring.h | |||
@@ -9,9 +9,12 @@ | |||
9 | #define BU_UTF_STRING_H | 9 | #define BU_UTF_STRING_H |
10 | 10 | ||
11 | #include <stdint.h> | 11 | #include <stdint.h> |
12 | #include "bu/array.h" | ||
12 | 13 | ||
13 | namespace Bu | 14 | namespace Bu |
14 | { | 15 | { |
16 | class String; | ||
17 | |||
15 | /** | 18 | /** |
16 | * UtfChar isn't actually a character, unicode specifies "code points" not | 19 | * UtfChar isn't actually a character, unicode specifies "code points" not |
17 | * characters. The main reason for this is that not all code points define | 20 | * characters. The main reason for this is that not all code points define |
@@ -40,10 +43,23 @@ namespace Bu | |||
40 | UtfString( const Bu::String &sInput, Encoding eEnc=Utf8 ); | 43 | UtfString( const Bu::String &sInput, Encoding eEnc=Utf8 ); |
41 | virtual ~UtfString(); | 44 | virtual ~UtfString(); |
42 | 45 | ||
43 | static void debugUtf8( const Bu::String &sUtf8 ); | 46 | void append( UtfChar ch ); |
47 | |||
48 | void set( const Bu::String &sInput, Encoding eEnc=Utf8 ); | ||
49 | void setUtf8( const Bu::String &sInput ); | ||
50 | void setUtf16( const Bu::String &sInput ); | ||
51 | // void setUtf16be( const Bu::String &sInput ); | ||
52 | // void setUtf16le( const Bu::String &sInput ); | ||
53 | |||
54 | void debug(); | ||
55 | |||
56 | UtfChar get( int iIndex ); | ||
57 | |||
58 | private: | ||
59 | void append16( uint16_t i ) { aData.append( i ); } | ||
44 | 60 | ||
45 | private: | 61 | private: |
46 | uint16_t *pData; | 62 | Bu::Array<uint16_t> aData; |
47 | int iRawLen; | 63 | int iRawLen; |
48 | int iCharLen; | 64 | int iCharLen; |
49 | }; | 65 | }; |