aboutsummaryrefslogtreecommitdiff
path: root/src/unstable/utfstring.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/unstable/utfstring.cpp')
-rw-r--r--src/unstable/utfstring.cpp539
1 files changed, 539 insertions, 0 deletions
diff --git a/src/unstable/utfstring.cpp b/src/unstable/utfstring.cpp
new file mode 100644
index 0000000..19d3ddc
--- /dev/null
+++ b/src/unstable/utfstring.cpp
@@ -0,0 +1,539 @@
1/*
2 * Copyright (C) 2007-2011 Xagasoft, All rights reserved.
3 *
4 * This file is part of the libbu++ library and is released under the
5 * terms of the license contained in the file LICENSE.
6 */
7
8#include "bu/utfstring.h"
9
10#include "bu/string.h"
11#include "bu/stream.h"
12#include "bu/config.h"
13#include "bu/sio.h"
14using Bu::sio;
15
16Bu::UtfString::UtfString()
17{
18}
19
20Bu::UtfString::UtfString( const Bu::String &sInput, Encoding eEnc )
21{
22 set( sInput, eEnc );
23}
24
25Bu::UtfString::~UtfString()
26{
27}
28
29void Bu::UtfString::set( const Bu::String &sInput, Encoding eEnc )
30{
31 switch( eEnc )
32 {
33 case Utf8:
34 setUtf8( sInput );
35 break;
36
37 case Utf16:
38 setUtf16( sInput );
39 break;
40
41 case Utf16be:
42 setUtf16be( sInput );
43 break;
44
45 case Utf16le:
46 setUtf16le( sInput );
47 break;
48
49 case Utf32:
50 setUtf32( sInput );
51 break;
52
53 case Utf32be:
54 setUtf32be( sInput );
55 break;
56
57 case Utf32le:
58 setUtf32le( sInput );
59 break;
60
61 case Ucs2:
62 throw Bu::ExceptionBase("Ucs2 not supported yet.");
63 break;
64
65 case Ucs4:
66 throw Bu::ExceptionBase("Ucs4 not supported yet.");
67 break;
68
69 case GuessEncoding:
70 throw Bu::ExceptionBase("Guessing mode not supported yet.");
71 break;
72 }
73}
74
75void Bu::UtfString::append( UtfChar ch )
76{
77 if( ch >= 0x10000 )
78 {
79 ch -= 0x10000;
80 append16( ((ch>>10)&0x3FF)| 0xD800u );
81 append16( (ch&0x3FF)| 0xDC00u );
82 }
83 else
84 {
85 append16( (uint16_t)(ch) );
86 }
87}
88
89void Bu::UtfString::setUtf8( const Bu::String &sInput )
90{
91 static uint8_t lmask[8] = {
92 0x00,
93 0x01,
94 0x03,
95 0x07,
96 0x0f,
97 0x1f,
98 0x3f,
99 0x7f
100 };
101 for( Bu::String::const_iterator i = sInput.begin(); i; i++ )
102 {
103 if( ((int)(uint8_t)*i)&0x80 )
104 {
105 int iBytes = 1;
106 for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { }
107 Bu::UtfChar uPt = ((*i) & lmask[7-iBytes])<<(6*(iBytes-1));
108 for( iBytes--; iBytes >= 1; iBytes-- )
109 {
110 i++;
111 uPt |= ((*i)&lmask[6])<<(6*(iBytes-1));
112 }
113 append( uPt );
114 }
115 else
116 {
117 append( (Bu::UtfChar)(*i) );
118 }
119 }
120}
121
122void Bu::UtfString::setUtf16( const Bu::String &sInput )
123{
124// Bu::String::const_iterator i = sInput.begin();
125 if( (uint8_t)*sInput.begin() == 0xFF &&
126 (uint8_t)*(sInput.begin()+1) == 0xFE )
127 {
128 setUtf16le( sInput );
129 return;
130 }
131 setUtf16be( sInput );
132}
133
134void Bu::UtfString::setUtf16be( const Bu::String &sInput )
135{
136 Bu::String::const_iterator i = sInput.begin();
137 if( (uint8_t)*sInput.begin() == 0xFE &&
138 (uint8_t)*(sInput.begin()+1) == 0xFF )
139
140 {
141 i += 2;
142 sio << "Verified big endian." << sio.nl;
143 }
144 else
145 {
146 sio << "Assuming big endian." << sio.nl;
147 }
148 uint16_t hi, lo;
149 for( ; i; i++ )
150 {
151 hi = (((uint8_t)*i)<<8) | ((uint8_t)*(++i));
152 append16( hi );
153 if( (hi&0xD800u) == 0xD800u )
154 {
155 lo = (((uint8_t)*(++i))<<8) | ((uint8_t)*(++i));
156 append16( lo );
157 }
158 }
159}
160
161void Bu::UtfString::setUtf16le( const Bu::String &sInput )
162{
163 Bu::String::const_iterator i = sInput.begin();
164 if( (uint8_t)*sInput.begin() == 0xFF &&
165 (uint8_t)*(sInput.begin()+1) == 0xFE )
166 {
167 i += 2;
168 sio << "Verified little endian." << sio.nl;
169 }
170 else
171 {
172 sio << "Assuming little endian." << sio.nl;
173 }
174 uint16_t hi, lo;
175 for( ; i; i++ )
176 {
177 hi = (((uint8_t)*i)) | ((uint8_t)*(++i)<<8);
178 append16( hi );
179 if( (hi&0xD800u) == 0xD800u )
180 {
181 lo = (((uint8_t)*(++i))) | ((uint8_t)*(++i)<<8);
182 append16( lo );
183 }
184 }
185}
186
187void Bu::UtfString::setUtf32( const Bu::String &sInput )
188{
189 Bu::String::const_iterator i = sInput.begin();
190 if( (uint8_t)*i == 0x00 &&
191 (uint8_t)*(++i) == 0x00 &&
192 (uint8_t)*(++i) == 0xFF &&
193 (uint8_t)*(++i) == 0xFE )
194 {
195 setUtf32le( sInput );
196 return;
197 }
198 setUtf32be( sInput );
199}
200
201void Bu::UtfString::setUtf32be( const Bu::String &sInput )
202{
203 Bu::String::const_iterator i = sInput.begin();
204 if( (uint8_t)*i == 0x00 &&
205 (uint8_t)*(++i) == 0x00 &&
206 (uint8_t)*(++i) == 0xFE &&
207 (uint8_t)*(++i) == 0xFF )
208 {
209 i++;
210 sio << "Verified big endian." << sio.nl;
211 }
212 else
213 {
214 i = sInput.begin();
215 sio << "Assuming big endian." << sio.nl;
216 }
217 for( ; i; i++ )
218 {
219 append( (((uint8_t)*i)<<24) |
220 (((uint8_t)*(++i))<<16) |
221 (((uint8_t)*(++i))<<8) |
222 ((uint8_t)*(++i))
223 );
224 }
225}
226
227void Bu::UtfString::setUtf32le( const Bu::String &sInput )
228{
229 Bu::String::const_iterator i = sInput.begin();
230 if( (uint8_t)*i == 0x00 &&
231 (uint8_t)*(++i) == 0x00 &&
232 (uint8_t)*(++i) == 0xFF &&
233 (uint8_t)*(++i) == 0xFE )
234 {
235 i++;
236 sio << "Verified little endian." << sio.nl;
237 }
238 else
239 {
240 i = sInput.begin();
241 sio << "Assuming little endian." << sio.nl;
242 }
243 for( ; i; i++ )
244 {
245 append( ((uint8_t)*i) |
246 (((uint8_t)*(++i))<<8) |
247 (((uint8_t)*(++i))<<16) |
248 (((uint8_t)*(++i))<<24)
249 );
250 }
251}
252
253void Bu::UtfString::write( Bu::Stream &sOut, Encoding eEnc )
254{
255 switch( eEnc )
256 {
257 case Utf8:
258 writeUtf8( sOut );
259 break;
260
261 case Utf16:
262// writeUtf16( sOut );
263// break;
264
265 case Utf16be:
266 writeUtf16be( sOut );
267 break;
268
269 case Utf16le:
270 writeUtf16le( sOut );
271 break;
272
273 case Utf32:
274// writeUtf32( sOut );
275// break;
276
277 case Utf32be:
278 writeUtf32be( sOut );
279 break;
280
281 case Utf32le:
282 writeUtf32le( sOut );
283 break;
284
285 case Ucs2:
286 throw Bu::ExceptionBase("Ucs2 not supported yet.");
287 break;
288
289 case Ucs4:
290 throw Bu::ExceptionBase("Ucs4 not supported yet.");
291 break;
292
293 case GuessEncoding:
294 throw Bu::ExceptionBase(
295 "GuessEncoding is incompatible with encoding.");
296 break;
297
298 }
299}
300
301void Bu::UtfString::writeUtf8( Bu::Stream &sOut )
302{
303 int iPos = 0;
304 while( iPos < aData.getSize() )
305 {
306 uint8_t uByte;
307 Bu::UtfChar chr = nextChar( iPos );
308 if( chr >= 0x010000 )
309 {
310 // Four bytes
311 // 111 111111 111111 111111
312 uByte = (chr>>18)|0xF0;
313 sOut.write( &uByte, 1 );
314 uByte = ((chr>>12)&0x3F)|0x80;
315 sOut.write( &uByte, 1 );
316 uByte = ((chr>>6)&0x3F)|0x80;
317 sOut.write( &uByte, 1 );
318 uByte = (chr&0x3F)|0x80;
319 sOut.write( &uByte, 1 );
320 }
321 else if( chr >= 0x800 )
322 {
323 // Three bytes
324 // 1111 111111 111111
325 uByte = (chr>>12)|0xE0;
326 sOut.write( &uByte, 1 );
327 uByte = ((chr>>6)&0x3F)|0x80;
328 sOut.write( &uByte, 1 );
329 uByte = (chr&0x3F)|0x80;
330 sOut.write( &uByte, 1 );
331 }
332 else if( chr >= 0x80 )
333 {
334 // Two bytes
335 // 11111 111111
336 uByte = (chr>>6)|0xC0;
337 sOut.write( &uByte, 1 );
338 uByte = (chr&0x3F)|0x80;
339 sOut.write( &uByte, 1 );
340 }
341 else
342 {
343 // One byte
344 uByte = chr;
345 sOut.write( &uByte, 1 );
346 }
347 }
348}
349/*
350void Bu::UtfString::writeUtf16( Bu::Stream &sOut )
351{
352}
353*/
354void Bu::UtfString::writeUtf16be( Bu::Stream &sOut )
355{
356#if BYTE_ORDER == BIG_ENDIAN
357 uint16_t iTmp = 0xFEFF; // Byte Order Marker
358 sOut.write( &iTmp, 2 );
359 for( Array<uint16_t>::iterator i = aData.begin(); i; i++ )
360 {
361 iTmp = *i;
362 sOut.write( &iTmp, 2 );
363 }
364#else
365 uint16_t iTmp = 0xFEFF; // Byte Order Marker
366 iTmp = (iTmp>>8) | (iTmp<<8);
367 sOut.write( &iTmp, 2 );
368 for( Array<uint16_t>::iterator i = aData.begin(); i; i++ )
369 {
370 iTmp = *i;
371 iTmp = (iTmp>>8) | (iTmp<<8);
372 sOut.write( &iTmp, 2 );
373 }
374#endif
375}
376
377void Bu::UtfString::writeUtf16le( Bu::Stream &sOut )
378{
379#if BYTE_ORDER == LITTLE_ENDIAN
380 uint16_t iTmp = 0xFEFF; // Byte Order Marker
381 sOut.write( &iTmp, 2 );
382 for( Array<uint16_t>::iterator i = aData.begin(); i; i++ )
383 {
384 iTmp = *i;
385 sOut.write( &iTmp, 2 );
386 }
387#else
388 uint16_t iTmp = 0xFEFF; // Byte Order Marker
389 iTmp = (iTmp>>8) | (iTmp<<8);
390 sOut.write( &iTmp, 2 );
391 for( Array<uint16_t>::iterator i = aData.begin(); i; i++ )
392 {
393 iTmp = *i;
394 iTmp = (iTmp>>8) | (iTmp<<8);
395 sOut.write( &iTmp, 2 );
396 }
397#endif
398}
399
400void Bu::UtfString::writeUtf32be( Bu::Stream &sOut )
401{
402#if BYTE_ORDER == BIG_ENDIAN
403 uint32_t iTmp = 0xFEFF; // Byte Order Marker
404 sOut.write( &iTmp, 4 );
405 int i = 0;
406 while( i < aData.getSize() )
407 {
408 iTmp = nextChar( i );
409 sOut.write( &iTmp, 4 );
410 }
411#else
412 uint32_t iTmp = 0xFEFF; // Byte Order Marker
413 iTmp = (iTmp>>24)|(iTmp<<24)|((iTmp&0xff0000)>>8)|((iTmp&0xff00)<<8);
414 sOut.write( &iTmp, 4 );
415 int i = 0;
416 while( i < aData.getSize() )
417 {
418 iTmp = nextChar( i );
419 iTmp = (iTmp>>24)|(iTmp<<24)|((iTmp&0xff0000)>>8)|((iTmp&0xff00)<<8);
420 sOut.write( &iTmp, 4 );
421 }
422#endif
423}
424
425void Bu::UtfString::writeUtf32le( Bu::Stream &sOut )
426{
427#if BYTE_ORDER == LITTLE_ENDIAN
428 uint32_t iTmp = 0xFEFF; // Byte Order Marker
429 sOut.write( &iTmp, 4 );
430 int i = 0;
431 while( i < aData.getSize() )
432 {
433 iTmp = nextChar( i );
434 sOut.write( &iTmp, 4 );
435 }
436#else
437 uint32_t iTmp = 0xFEFF; // Byte Order Marker
438 iTmp = (iTmp>>24)|(iTmp<<24)|((iTmp&0xff0000)>>8)|((iTmp&0xff00)<<8);
439 sOut.write( &iTmp, 4 );
440 int i = 0;
441 while( i < aData.getSize() )
442 {
443 iTmp = nextChar( i );
444 iTmp = (iTmp>>24)|(iTmp<<24)|((iTmp&0xff0000)>>8)|((iTmp&0xff00)<<8);
445 sOut.write( &iTmp, 4 );
446 }
447#endif
448}
449
450Bu::UtfChar Bu::UtfString::get( int iIndex )
451{
452 return nextChar( iIndex );
453}
454
455Bu::UtfChar Bu::UtfString::nextChar( int &iIndex )
456{
457 Bu::UtfChar i = aData[iIndex++];
458 switch( i&0xFC00 )
459 {
460 case 0xD800:
461 return (((i&0x3FF)<<10) | ((aData[iIndex++]&0x3FF)))+0x10000;
462
463 case 0xDC00:
464 return (((aData[iIndex-2]&0x3FF)<<10) | ((i&0x3FF)))+0x10000;
465
466 default:
467 return i;
468 }
469}
470
471void Bu::UtfString::debug()
472{
473 sio << "Raw Utf16: ";
474 for( int i = 0; i < aData.getSize(); i++ )
475 {
476 if( i > 0 )
477 sio << ", ";
478 sio << "0x" << Fmt::hex() << aData[i];
479 }
480 sio << sio.nl;
481 sio << "Code Points: ";
482 for( int i = 0; i < aData.getSize(); i++ )
483 {
484 if( i > 0 )
485 sio << ", ";
486 sio << "0x" << Fmt::hex() << nextChar( i );
487 }
488 sio << sio.nl;
489}
490/*
491void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 )
492{
493 static uint8_t lmask[8] = {
494 0x00,
495 0x01,
496 0x03,
497 0x07,
498 0x0f,
499 0x1f,
500 0x3f,
501 0x7f
502 };
503 for( Bu::String::const_iterator i = sUtf8.begin(); i; i++ )
504 {
505 if( i != sUtf8.begin() )
506 sio << ", ";
507 if( ((int)(uint8_t)*i)&0x80 )
508 {
509// sio << "Flag byte: " << Bu::Fmt().radix(2).width(8).fill('0')
510// << (int)(uint8_t)*i << sio.nl;
511 int iBytes = 1;
512 for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { }
513// sio << "iBytes = " << iBytes << sio.nl;
514 Bu::UtfChar uPt = ((*i) & lmask[7-iBytes])<<(6*(iBytes-1));
515// sio << "mask: " << Bu::Fmt().radix(2).width(8).fill('0')
516// << (int)lmask[7-iBytes] << sio.nl;
517 for( iBytes--; iBytes >= 1; iBytes-- )
518 {
519// sio << "iBytes = " << iBytes << ", shift = " << (6*(iBytes-1))
520// << sio.nl;
521// sio << "next: " << Bu::Fmt().radix(2).width(8).fill('0')
522// << (int)(uint8_t)*i << sio.nl
523// << "mask: " << Bu::Fmt().radix(2).width(8).fill('0')
524// << (int)lmask[6] << sio.nl;
525 i++;
526 uPt |= ((*i)&lmask[6])<<(6*(iBytes-1));
527 }
528 sio << uPt;
529// sio << " (" << Bu::Fmt( 8, 2 ).fill('0')
530// << uPt << ")";
531 }
532 else
533 {
534 sio << (int)((uint8_t)*i);
535 }
536 }
537 sio << sio.nl;
538}
539*/