diff options
Diffstat (limited to '')
-rw-r--r-- | src/utfstring.cpp | 240 | ||||
-rw-r--r-- | src/utfstring.h | 52 | ||||
-rw-r--r-- | test.utf16 | 1 | ||||
-rw-r--r-- | test.utf16be | 1 | ||||
-rw-r--r-- | test.utf16le | 1 | ||||
-rw-r--r-- | utf16.cpp | 42 |
6 files changed, 319 insertions, 18 deletions
diff --git a/src/utfstring.cpp b/src/utfstring.cpp index bb0a011..7c4ba19 100644 --- a/src/utfstring.cpp +++ b/src/utfstring.cpp | |||
@@ -8,9 +8,13 @@ | |||
8 | #include "bu/utfstring.h" | 8 | #include "bu/utfstring.h" |
9 | 9 | ||
10 | #include "bu/string.h" | 10 | #include "bu/string.h" |
11 | #include "bu/stream.h" | ||
11 | 12 | ||
12 | #include <endian.h> | 13 | #include <endian.h> |
13 | 14 | ||
15 | #include "bu/sio.h" | ||
16 | using Bu::sio; | ||
17 | |||
14 | Bu::UtfString::UtfString() | 18 | Bu::UtfString::UtfString() |
15 | { | 19 | { |
16 | } | 20 | } |
@@ -33,20 +37,35 @@ void Bu::UtfString::set( const Bu::String &sInput, Encoding eEnc ) | |||
33 | break; | 37 | break; |
34 | 38 | ||
35 | case Utf16: | 39 | case Utf16: |
36 | case Utf16be: | ||
37 | setUtf16( sInput ); | 40 | setUtf16( sInput ); |
38 | break; | 41 | break; |
39 | 42 | ||
43 | case Utf16be: | ||
44 | setUtf16be( sInput ); | ||
45 | break; | ||
46 | |||
40 | case Utf16le: | 47 | case Utf16le: |
41 | throw Bu::ExceptionBase("Utf16le not supported yet."); | 48 | setUtf16le( sInput ); |
42 | break; | 49 | break; |
43 | 50 | ||
44 | case Utf32: | 51 | case Utf32: |
45 | throw Bu::ExceptionBase("Utf32 not supported yet."); | 52 | setUtf32( sInput ); |
53 | break; | ||
54 | |||
55 | case Utf32be: | ||
56 | setUtf32be( sInput ); | ||
57 | break; | ||
58 | |||
59 | case Utf32le: | ||
60 | setUtf32le( sInput ); | ||
61 | break; | ||
62 | |||
63 | case Ucs2: | ||
64 | throw Bu::ExceptionBase("Ucs2 not supported yet."); | ||
46 | break; | 65 | break; |
47 | 66 | ||
48 | case Ucs16: | 67 | case Ucs4: |
49 | throw Bu::ExceptionBase("Ucs16 not supported yet."); | 68 | throw Bu::ExceptionBase("Ucs4 not supported yet."); |
50 | break; | 69 | break; |
51 | 70 | ||
52 | case GuessEncoding: | 71 | case GuessEncoding: |
@@ -104,8 +123,32 @@ void Bu::UtfString::setUtf8( const Bu::String &sInput ) | |||
104 | 123 | ||
105 | void Bu::UtfString::setUtf16( const Bu::String &sInput ) | 124 | void Bu::UtfString::setUtf16( const Bu::String &sInput ) |
106 | { | 125 | { |
126 | Bu::String::const_iterator i = sInput.begin(); | ||
127 | if( (uint8_t)*sInput.begin() == 0xFF && | ||
128 | (uint8_t)*(sInput.begin()+1) == 0xFE ) | ||
129 | { | ||
130 | setUtf16le( sInput ); | ||
131 | return; | ||
132 | } | ||
133 | setUtf16be( sInput ); | ||
134 | } | ||
135 | |||
136 | void Bu::UtfString::setUtf16be( const Bu::String &sInput ) | ||
137 | { | ||
138 | Bu::String::const_iterator i = sInput.begin(); | ||
139 | if( (uint8_t)*sInput.begin() == 0xFE && | ||
140 | (uint8_t)*(sInput.begin()+1) == 0xFF ) | ||
141 | |||
142 | { | ||
143 | i += 2; | ||
144 | sio << "Verified big endian." << sio.nl; | ||
145 | } | ||
146 | else | ||
147 | { | ||
148 | sio << "Assuming big endian." << sio.nl; | ||
149 | } | ||
107 | uint16_t hi, lo; | 150 | uint16_t hi, lo; |
108 | for( Bu::String::const_iterator i = sInput.begin(); i; i++ ) | 151 | for( ; i; i++ ) |
109 | { | 152 | { |
110 | hi = (((uint8_t)*i)<<8) | ((uint8_t)*(++i)); | 153 | hi = (((uint8_t)*i)<<8) | ((uint8_t)*(++i)); |
111 | append16( hi ); | 154 | append16( hi ); |
@@ -117,25 +160,192 @@ void Bu::UtfString::setUtf16( const Bu::String &sInput ) | |||
117 | } | 160 | } |
118 | } | 161 | } |
119 | 162 | ||
120 | #include "bu/sio.h" | 163 | void Bu::UtfString::setUtf16le( const Bu::String &sInput ) |
121 | using Bu::sio; | 164 | { |
165 | Bu::String::const_iterator i = sInput.begin(); | ||
166 | if( (uint8_t)*sInput.begin() == 0xFF && | ||
167 | (uint8_t)*(sInput.begin()+1) == 0xFE ) | ||
168 | { | ||
169 | i += 2; | ||
170 | sio << "Verified little endian." << sio.nl; | ||
171 | } | ||
172 | else | ||
173 | { | ||
174 | sio << "Assuming little endian." << sio.nl; | ||
175 | } | ||
176 | uint16_t hi, lo; | ||
177 | for( ; i; i++ ) | ||
178 | { | ||
179 | hi = (((uint8_t)*i)) | ((uint8_t)*(++i)<<8); | ||
180 | append16( hi ); | ||
181 | if( (hi&0xD800u) == 0xD800u ) | ||
182 | { | ||
183 | lo = (((uint8_t)*(++i))) | ((uint8_t)*(++i)<<8); | ||
184 | append16( lo ); | ||
185 | } | ||
186 | } | ||
187 | } | ||
188 | |||
189 | void Bu::UtfString::setUtf32( const Bu::String &sInput ) | ||
190 | { | ||
191 | Bu::String::const_iterator i = sInput.begin(); | ||
192 | if( (uint8_t)*i == 0x00 && | ||
193 | (uint8_t)*(++i) == 0x00 && | ||
194 | (uint8_t)*(++i) == 0xFF && | ||
195 | (uint8_t)*(++i) == 0xFE ) | ||
196 | { | ||
197 | setUtf32le( sInput ); | ||
198 | return; | ||
199 | } | ||
200 | setUtf32be( sInput ); | ||
201 | } | ||
202 | |||
203 | void Bu::UtfString::setUtf32be( const Bu::String &sInput ) | ||
204 | { | ||
205 | Bu::String::const_iterator i = sInput.begin(); | ||
206 | if( (uint8_t)*i == 0x00 && | ||
207 | (uint8_t)*(++i) == 0x00 && | ||
208 | (uint8_t)*(++i) == 0xFE && | ||
209 | (uint8_t)*(++i) == 0xFF ) | ||
210 | { | ||
211 | i++; | ||
212 | sio << "Verified big endian." << sio.nl; | ||
213 | } | ||
214 | else | ||
215 | { | ||
216 | i = sInput.begin(); | ||
217 | sio << "Assuming big endian." << sio.nl; | ||
218 | } | ||
219 | for( ; i; i++ ) | ||
220 | { | ||
221 | append( (((uint8_t)*i)<<24) | | ||
222 | (((uint8_t)*(++i))<<16) | | ||
223 | (((uint8_t)*(++i))<<8) | | ||
224 | ((uint8_t)*(++i)) | ||
225 | ); | ||
226 | } | ||
227 | } | ||
228 | |||
229 | void Bu::UtfString::setUtf32le( const Bu::String &sInput ) | ||
230 | { | ||
231 | Bu::String::const_iterator i = sInput.begin(); | ||
232 | if( (uint8_t)*i == 0x00 && | ||
233 | (uint8_t)*(++i) == 0x00 && | ||
234 | (uint8_t)*(++i) == 0xFF && | ||
235 | (uint8_t)*(++i) == 0xFE ) | ||
236 | { | ||
237 | i++; | ||
238 | sio << "Verified little endian." << sio.nl; | ||
239 | } | ||
240 | else | ||
241 | { | ||
242 | i = sInput.begin(); | ||
243 | sio << "Assuming little endian." << sio.nl; | ||
244 | } | ||
245 | for( ; i; i++ ) | ||
246 | { | ||
247 | append( ((uint8_t)*i) | | ||
248 | (((uint8_t)*(++i))<<8) | | ||
249 | (((uint8_t)*(++i))<<16) | | ||
250 | (((uint8_t)*(++i))<<24) | ||
251 | ); | ||
252 | } | ||
253 | } | ||
254 | |||
255 | void Bu::UtfString::write( Bu::Stream &sOut, Encoding eEnc ) | ||
256 | { | ||
257 | switch( eEnc ) | ||
258 | { | ||
259 | case Utf8: | ||
260 | writeUtf8( sOut ); | ||
261 | break; | ||
262 | |||
263 | case Utf16: | ||
264 | writeUtf16( sOut ); | ||
265 | break; | ||
266 | |||
267 | case Utf16be: | ||
268 | writeUtf16be( sOut ); | ||
269 | break; | ||
270 | |||
271 | case Utf16le: | ||
272 | writeUtf16le( sOut ); | ||
273 | break; | ||
274 | |||
275 | case Utf32: | ||
276 | writeUtf32( sOut ); | ||
277 | break; | ||
278 | |||
279 | case Utf32be: | ||
280 | writeUtf32be( sOut ); | ||
281 | break; | ||
282 | |||
283 | case Utf32le: | ||
284 | writeUtf32le( sOut ); | ||
285 | break; | ||
286 | |||
287 | case Ucs2: | ||
288 | throw Bu::ExceptionBase("Ucs2 not supported yet."); | ||
289 | break; | ||
290 | |||
291 | case Ucs4: | ||
292 | throw Bu::ExceptionBase("Ucs4 not supported yet."); | ||
293 | break; | ||
294 | |||
295 | case GuessEncoding: | ||
296 | throw Bu::ExceptionBase( | ||
297 | "GuessEncoding is incompatible with encoding."); | ||
298 | break; | ||
299 | |||
300 | } | ||
301 | } | ||
302 | |||
303 | void Bu::UtfString::writeUtf8( Bu::Stream &sOut ) | ||
304 | { | ||
305 | } | ||
306 | |||
307 | void Bu::UtfString::writeUtf16( Bu::Stream &sOut ) | ||
308 | { | ||
309 | } | ||
310 | |||
311 | void Bu::UtfString::writeUtf16be( Bu::Stream &sOut ) | ||
312 | { | ||
313 | } | ||
314 | |||
315 | void Bu::UtfString::writeUtf16le( Bu::Stream &sOut ) | ||
316 | { | ||
317 | } | ||
318 | |||
319 | void Bu::UtfString::writeUtf32( Bu::Stream &sOut ) | ||
320 | { | ||
321 | } | ||
322 | |||
323 | void Bu::UtfString::writeUtf32be( Bu::Stream &sOut ) | ||
324 | { | ||
325 | } | ||
326 | |||
327 | void Bu::UtfString::writeUtf32le( Bu::Stream &sOut ) | ||
328 | { | ||
329 | } | ||
122 | 330 | ||
123 | Bu::UtfChar Bu::UtfString::get( int iIndex ) | 331 | Bu::UtfChar Bu::UtfString::get( int iIndex ) |
124 | { | 332 | { |
125 | Bu::UtfChar i = aData[iIndex]; | 333 | return nextChar( iIndex ); |
334 | } | ||
335 | |||
336 | Bu::UtfChar Bu::UtfString::nextChar( int &iIndex ) | ||
337 | { | ||
338 | Bu::UtfChar i = aData[iIndex++]; | ||
126 | switch( i&0xFC00 ) | 339 | switch( i&0xFC00 ) |
127 | { | 340 | { |
128 | case 0xD800: | 341 | case 0xD800: |
129 | sio << "(hi) "; | 342 | return (((i&0x3FF)<<10) | ((aData[iIndex++]&0x3FF)))+0x10000; |
130 | return (((i&0x3FF)<<10) | ((aData[iIndex+1]&0x3FF)))+0x10000; | ||
131 | 343 | ||
132 | case 0xDC00: | 344 | case 0xDC00: |
133 | sio << "(lo) "; | 345 | return (((aData[iIndex-2]&0x3FF)<<10) | ((i&0x3FF)))+0x10000; |
134 | return 0; | ||
135 | 346 | ||
136 | default: | 347 | default: |
137 | sio << "(--) "; | 348 | return i; |
138 | return i&0xFC00; | ||
139 | } | 349 | } |
140 | } | 350 | } |
141 | 351 | ||
diff --git a/src/utfstring.h b/src/utfstring.h index 79ef62e..8448ea4 100644 --- a/src/utfstring.h +++ b/src/utfstring.h | |||
@@ -14,6 +14,7 @@ | |||
14 | namespace Bu | 14 | namespace Bu |
15 | { | 15 | { |
16 | class String; | 16 | class String; |
17 | class Stream; | ||
17 | 18 | ||
18 | /** | 19 | /** |
19 | * UtfChar isn't actually a character, unicode specifies "code points" not | 20 | * UtfChar isn't actually a character, unicode specifies "code points" not |
@@ -35,7 +36,10 @@ namespace Bu | |||
35 | Utf16be, | 36 | Utf16be, |
36 | Utf16le, | 37 | Utf16le, |
37 | Utf32, | 38 | Utf32, |
38 | Ucs16, | 39 | Utf32be, |
40 | Utf32le, | ||
41 | Ucs2, | ||
42 | Ucs4, | ||
39 | GuessEncoding | 43 | GuessEncoding |
40 | }; | 44 | }; |
41 | 45 | ||
@@ -43,17 +47,59 @@ namespace Bu | |||
43 | UtfString( const Bu::String &sInput, Encoding eEnc=Utf8 ); | 47 | UtfString( const Bu::String &sInput, Encoding eEnc=Utf8 ); |
44 | virtual ~UtfString(); | 48 | virtual ~UtfString(); |
45 | 49 | ||
50 | class iterator | ||
51 | { | ||
52 | private: | ||
53 | iterator( UtfString *pSrc, int iCodePos ) : | ||
54 | pSrc( pSrc ), iCodePos( iCodePos ) | ||
55 | { | ||
56 | } | ||
57 | |||
58 | public: | ||
59 | iterator() : | ||
60 | pSrc( NULL ), iCodePos( 0 ) | ||
61 | { | ||
62 | } | ||
63 | |||
64 | UtfChar operator*() | ||
65 | { | ||
66 | if( !pSrc ) | ||
67 | throw Bu::ExceptionBase("invalid UtfString::iterator dereferenced."); | ||
68 | return pSrc->nextChar( iCodePos ); | ||
69 | } | ||
70 | |||
71 | private: | ||
72 | UtfString *pSrc; | ||
73 | int iCodePos; | ||
74 | }; | ||
75 | |||
46 | void append( UtfChar ch ); | 76 | void append( UtfChar ch ); |
47 | 77 | ||
48 | void set( const Bu::String &sInput, Encoding eEnc=Utf8 ); | 78 | void set( const Bu::String &sInput, Encoding eEnc=Utf8 ); |
49 | void setUtf8( const Bu::String &sInput ); | 79 | void setUtf8( const Bu::String &sInput ); |
50 | void setUtf16( const Bu::String &sInput ); | 80 | void setUtf16( const Bu::String &sInput ); |
51 | // void setUtf16be( const Bu::String &sInput ); | 81 | void setUtf16be( const Bu::String &sInput ); |
52 | // void setUtf16le( const Bu::String &sInput ); | 82 | void setUtf16le( const Bu::String &sInput ); |
83 | void setUtf32( const Bu::String &sInput ); | ||
84 | void setUtf32be( const Bu::String &sInput ); | ||
85 | void setUtf32le( const Bu::String &sInput ); | ||
86 | |||
87 | void write( Bu::Stream &sOut, Encoding eEnc=Utf8 ); | ||
88 | void writeUtf8( Bu::Stream &sOut ); | ||
89 | void writeUtf16( Bu::Stream &sOut ); | ||
90 | void writeUtf16be( Bu::Stream &sOut ); | ||
91 | void writeUtf16le( Bu::Stream &sOut ); | ||
92 | void writeUtf32( Bu::Stream &sOut ); | ||
93 | void writeUtf32be( Bu::Stream &sOut ); | ||
94 | void writeUtf32le( Bu::Stream &sOut ); | ||
95 | |||
96 | Bu::String to( Encoding eEnc=Utf8 ); | ||
97 | Bu::String toUtf8(); | ||
53 | 98 | ||
54 | void debug(); | 99 | void debug(); |
55 | 100 | ||
56 | UtfChar get( int iIndex ); | 101 | UtfChar get( int iIndex ); |
102 | UtfChar nextChar( int &iIndex ); | ||
57 | 103 | ||
58 | private: | 104 | private: |
59 | void append16( uint16_t i ) { aData.append( i ); } | 105 | void append16( uint16_t i ) { aData.append( i ); } |
diff --git a/test.utf16 b/test.utf16 new file mode 100644 index 0000000..86a63c3 --- /dev/null +++ b/test.utf16 | |||
@@ -0,0 +1 @@ | |||
¥Ëæ)˜Ø=Þ<Ûÿßý$H \ No newline at end of file | |||
diff --git a/test.utf16be b/test.utf16be new file mode 100644 index 0000000..136ad1a --- /dev/null +++ b/test.utf16be | |||
@@ -0,0 +1 @@ | |||
þÿ¥Ëæ)˜Ø=Þ<Ûÿßý$H \ No newline at end of file | |||
diff --git a/test.utf16le b/test.utf16le new file mode 100644 index 0000000..9f610d6 --- /dev/null +++ b/test.utf16le | |||
@@ -0,0 +1 @@ | |||
ÿþ¥Ëæ˜)=Ø<ÞÿÛýßH$ \ No newline at end of file | |||
diff --git a/utf16.cpp b/utf16.cpp new file mode 100644 index 0000000..eedb521 --- /dev/null +++ b/utf16.cpp | |||
@@ -0,0 +1,42 @@ | |||
1 | #include <stdio.h> | ||
2 | #include <stdint.h> | ||
3 | |||
/**
 * Print the 16 bits of u to stdout, most significant bit first, followed by
 * a newline.
 */
void bitprint( uint16_t u )
{
    int iBit = 16;
    while( iBit-- > 0 )
    {
        putchar( ((u>>iBit)&1) ? '1' : '0' );
    }
    putchar('\n');
}
10 | |||
/**
 * Print the 32 bits of u to stdout, most significant bit first, followed by
 * a newline.
 */
void bitprint( uint32_t u )
{
    for( int i = 31; i >= 0; i-- )
    {
        // Shift the unsigned value down rather than shifting a signed 1 up:
        // 1<<31 would move a bit into the sign position of int.
        printf("%c", ((u>>i)&1u)?'1':'0');
    }
    printf("\n");
}
17 | |||
/**
 * Split a code point into a UTF-16 surrogate pair and print the mapping.
 *
 * 0x10000 is subtracted from the input; the upper ten bits of the result are
 * stored in outHi tagged with 0xD800, the lower ten in outLo tagged with
 * 0xDC00.  No range check is made on the input.
 *
 * @param in    Code point to encode.
 * @param outHi Receives the high surrogate.
 * @param outLo Receives the low surrogate.
 */
void utoutf16( uint32_t in, uint16_t &outHi, uint16_t &outLo )
{
    const uint32_t uOffset = in - 0x10000;
    outHi = ((uOffset>>10)&0x3FF) | 0xD800u;
    outLo = (uOffset&0x3FF) | 0xDC00u;
    printf("0x%X == 0x%X, 0x%X\n", in, outHi, outLo );
}
24 | |||
/**
 * Combine a UTF-16 surrogate pair into a code point.
 *
 * The low ten bits of each half are joined (hi above lo) and 0x10000 is
 * added.  The halves are not checked for being valid surrogates.
 *
 * @param hi High surrogate.
 * @param lo Low surrogate.
 * @return The combined code point.
 */
int32_t utf16tou( uint16_t hi, uint16_t lo )
{
    const uint32_t uTop = ((uint32_t)hi & 0x3FF) << 10;
    const uint32_t uBottom = lo & 0x3FF;
    return (uTop | uBottom) + 0x10000;
}
29 | |||
30 | int main() | ||
31 | { | ||
32 | bitprint( 0xD800u ); | ||
33 | bitprint( 0xDC00u ); | ||
34 | uint16_t hi, lo; | ||
35 | utoutf16( 0x1D11E, hi, lo ); // Cat face with wry smile | ||
36 | utoutf16( 0x10FFFD, hi, lo ); // Cat face with wry smile | ||
37 | utoutf16( 0x1F63C, hi, lo ); // Cat face with wry smile | ||
38 | bitprint( hi ); | ||
39 | bitprint( lo ); | ||
40 | printf("0x%X\n", utf16tou( hi, lo ) ); | ||
41 | return 0; | ||
42 | } | ||