aboutsummaryrefslogtreecommitdiff
path: root/src/unstable/utfstring.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/unstable/utfstring.cpp')
-rw-r--r--src/unstable/utfstring.cpp842
1 files changed, 421 insertions, 421 deletions
diff --git a/src/unstable/utfstring.cpp b/src/unstable/utfstring.cpp
index 9fe2d02..421d5fb 100644
--- a/src/unstable/utfstring.cpp
+++ b/src/unstable/utfstring.cpp
@@ -20,12 +20,12 @@ Bu::UtfString::UtfString()
20 20
21Bu::UtfString::UtfString( const Bu::String &sInput, Encoding eEnc ) 21Bu::UtfString::UtfString( const Bu::String &sInput, Encoding eEnc )
22{ 22{
23 set( sInput, eEnc ); 23 set( sInput, eEnc );
24} 24}
25 25
26Bu::UtfString::UtfString( const char *sInput, Encoding eEnc ) 26Bu::UtfString::UtfString( const char *sInput, Encoding eEnc )
27{ 27{
28 set( sInput, eEnc ); 28 set( sInput, eEnc );
29} 29}
30 30
31Bu::UtfString::~UtfString() 31Bu::UtfString::~UtfString()
@@ -34,340 +34,340 @@ Bu::UtfString::~UtfString()
34 34
35Bu::UtfString::iterator Bu::UtfString::begin() 35Bu::UtfString::iterator Bu::UtfString::begin()
36{ 36{
37 return Bu::UtfString::iterator( this, 0 ); 37 return Bu::UtfString::iterator( this, 0 );
38} 38}
39 39
40Bu::UtfString::const_iterator Bu::UtfString::begin() const 40Bu::UtfString::const_iterator Bu::UtfString::begin() const
41{ 41{
42 return Bu::UtfString::const_iterator( this, 0 ); 42 return Bu::UtfString::const_iterator( this, 0 );
43} 43}
44 44
45void Bu::UtfString::set( const Bu::String &sInput, Encoding eEnc ) 45void Bu::UtfString::set( const Bu::String &sInput, Encoding eEnc )
46{ 46{
47 switch( eEnc ) 47 switch( eEnc )
48 { 48 {
49 case Utf8: 49 case Utf8:
50 setUtf8( sInput ); 50 setUtf8( sInput );
51 break; 51 break;
52 52
53 case Utf16: 53 case Utf16:
54 setUtf16( sInput ); 54 setUtf16( sInput );
55 break; 55 break;
56 56
57 case Utf16be: 57 case Utf16be:
58 setUtf16be( sInput ); 58 setUtf16be( sInput );
59 break; 59 break;
60 60
61 case Utf16le: 61 case Utf16le:
62 setUtf16le( sInput ); 62 setUtf16le( sInput );
63 break; 63 break;
64 64
65 case Utf32: 65 case Utf32:
66 setUtf32( sInput ); 66 setUtf32( sInput );
67 break; 67 break;
68 68
69 case Utf32be: 69 case Utf32be:
70 setUtf32be( sInput ); 70 setUtf32be( sInput );
71 break; 71 break;
72 72
73 case Utf32le: 73 case Utf32le:
74 setUtf32le( sInput ); 74 setUtf32le( sInput );
75 break; 75 break;
76 76
77 case Ucs2: 77 case Ucs2:
78 throw Bu::ExceptionBase("Ucs2 not supported yet."); 78 throw Bu::ExceptionBase("Ucs2 not supported yet.");
79 break; 79 break;
80 80
81 case Ucs4: 81 case Ucs4:
82 throw Bu::ExceptionBase("Ucs4 not supported yet."); 82 throw Bu::ExceptionBase("Ucs4 not supported yet.");
83 break; 83 break;
84 84
85 case GuessEncoding: 85 case GuessEncoding:
86 throw Bu::ExceptionBase("Guessing mode not supported yet."); 86 throw Bu::ExceptionBase("Guessing mode not supported yet.");
87 break; 87 break;
88 } 88 }
89} 89}
90 90
91void Bu::UtfString::append( UtfChar ch ) 91void Bu::UtfString::append( UtfChar ch )
92{ 92{
93 if( ch >= 0x10000 ) 93 if( ch >= 0x10000 )
94 { 94 {
95 ch -= 0x10000; 95 ch -= 0x10000;
96 append16( ((ch>>10)&0x3FF)| 0xD800u ); 96 append16( ((ch>>10)&0x3FF)| 0xD800u );
97 append16( (ch&0x3FF)| 0xDC00u ); 97 append16( (ch&0x3FF)| 0xDC00u );
98 } 98 }
99 else 99 else
100 { 100 {
101 append16( (uint16_t)(ch) ); 101 append16( (uint16_t)(ch) );
102 } 102 }
103} 103}
104 104
105void Bu::UtfString::append( const UtfString &rSrc ) 105void Bu::UtfString::append( const UtfString &rSrc )
106{ 106{
107 aData.append( rSrc.aData ); 107 aData.append( rSrc.aData );
108 iRawLen += rSrc.iRawLen; 108 iRawLen += rSrc.iRawLen;
109 iCharLen += rSrc.iCharLen; 109 iCharLen += rSrc.iCharLen;
110} 110}
111 111
112void Bu::UtfString::setUtf8( const Bu::String &sInput ) 112void Bu::UtfString::setUtf8( const Bu::String &sInput )
113{ 113{
114 static uint8_t lmask[8] = { 114 static uint8_t lmask[8] = {
115 0x00, 115 0x00,
116 0x01, 116 0x01,
117 0x03, 117 0x03,
118 0x07, 118 0x07,
119 0x0f, 119 0x0f,
120 0x1f, 120 0x1f,
121 0x3f, 121 0x3f,
122 0x7f 122 0x7f
123 }; 123 };
124 for( Bu::String::const_iterator i = sInput.begin(); i; i++ ) 124 for( Bu::String::const_iterator i = sInput.begin(); i; i++ )
125 { 125 {
126 if( ((int)(uint8_t)*i)&0x80 ) 126 if( ((int)(uint8_t)*i)&0x80 )
127 { 127 {
128 int iBytes = 1; 128 int iBytes = 1;
129 for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { } 129 for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { }
130 Bu::UtfChar uPt = ((*i) & lmask[7-iBytes])<<(6*(iBytes-1)); 130 Bu::UtfChar uPt = ((*i) & lmask[7-iBytes])<<(6*(iBytes-1));
131 for( iBytes--; iBytes >= 1; iBytes-- ) 131 for( iBytes--; iBytes >= 1; iBytes-- )
132 { 132 {
133 i++; 133 i++;
134 uPt |= ((*i)&lmask[6])<<(6*(iBytes-1)); 134 uPt |= ((*i)&lmask[6])<<(6*(iBytes-1));
135 } 135 }
136 append( uPt ); 136 append( uPt );
137 } 137 }
138 else 138 else
139 { 139 {
140 append( (Bu::UtfChar)(*i) ); 140 append( (Bu::UtfChar)(*i) );
141 } 141 }
142 } 142 }
143} 143}
144 144
145void Bu::UtfString::setUtf16( const Bu::String &sInput ) 145void Bu::UtfString::setUtf16( const Bu::String &sInput )
146{ 146{
147// Bu::String::const_iterator i = sInput.begin(); 147// Bu::String::const_iterator i = sInput.begin();
148 if( (uint8_t)*sInput.begin() == 0xFF && 148 if( (uint8_t)*sInput.begin() == 0xFF &&
149 (uint8_t)*(sInput.begin()+1) == 0xFE ) 149 (uint8_t)*(sInput.begin()+1) == 0xFE )
150 { 150 {
151 setUtf16le( sInput ); 151 setUtf16le( sInput );
152 return; 152 return;
153 } 153 }
154 setUtf16be( sInput ); 154 setUtf16be( sInput );
155} 155}
156 156
157void Bu::UtfString::setUtf16be( const Bu::String &sInput ) 157void Bu::UtfString::setUtf16be( const Bu::String &sInput )
158{ 158{
159 Bu::String::const_iterator i = sInput.begin(); 159 Bu::String::const_iterator i = sInput.begin();
160 if( (uint8_t)*sInput.begin() == 0xFE && 160 if( (uint8_t)*sInput.begin() == 0xFE &&
161 (uint8_t)*(sInput.begin()+1) == 0xFF ) 161 (uint8_t)*(sInput.begin()+1) == 0xFF )
162 162
163 { 163 {
164 i += 2; 164 i += 2;
165 sio << "Verified big endian." << sio.nl; 165 sio << "Verified big endian." << sio.nl;
166 } 166 }
167 else 167 else
168 { 168 {
169 sio << "Assuming big endian." << sio.nl; 169 sio << "Assuming big endian." << sio.nl;
170 } 170 }
171 uint16_t hi, lo; 171 uint16_t hi, lo;
172 for( ; i; i++ ) 172 for( ; i; i++ )
173 { 173 {
174 hi = (((uint8_t)*i)<<8) | ((uint8_t)*(++i)); 174 hi = (((uint8_t)*i)<<8) | ((uint8_t)*(++i));
175 append16( hi ); 175 append16( hi );
176 if( (hi&0xD800u) == 0xD800u ) 176 if( (hi&0xD800u) == 0xD800u )
177 { 177 {
178 lo = (((uint8_t)*(++i))<<8) | ((uint8_t)*(++i)); 178 lo = (((uint8_t)*(++i))<<8) | ((uint8_t)*(++i));
179 append16( lo ); 179 append16( lo );
180 } 180 }
181 } 181 }
182} 182}
183 183
184void Bu::UtfString::setUtf16le( const Bu::String &sInput ) 184void Bu::UtfString::setUtf16le( const Bu::String &sInput )
185{ 185{
186 Bu::String::const_iterator i = sInput.begin(); 186 Bu::String::const_iterator i = sInput.begin();
187 if( (uint8_t)*sInput.begin() == 0xFF && 187 if( (uint8_t)*sInput.begin() == 0xFF &&
188 (uint8_t)*(sInput.begin()+1) == 0xFE ) 188 (uint8_t)*(sInput.begin()+1) == 0xFE )
189 { 189 {
190 i += 2; 190 i += 2;
191 sio << "Verified little endian." << sio.nl; 191 sio << "Verified little endian." << sio.nl;
192 } 192 }
193 else 193 else
194 { 194 {
195 sio << "Assuming little endian." << sio.nl; 195 sio << "Assuming little endian." << sio.nl;
196 } 196 }
197 uint16_t hi, lo; 197 uint16_t hi, lo;
198 for( ; i; i++ ) 198 for( ; i; i++ )
199 { 199 {
200 hi = (((uint8_t)*i)) | ((uint8_t)*(++i)<<8); 200 hi = (((uint8_t)*i)) | ((uint8_t)*(++i)<<8);
201 append16( hi ); 201 append16( hi );
202 if( (hi&0xD800u) == 0xD800u ) 202 if( (hi&0xD800u) == 0xD800u )
203 { 203 {
204 lo = (((uint8_t)*(++i))) | ((uint8_t)*(++i)<<8); 204 lo = (((uint8_t)*(++i))) | ((uint8_t)*(++i)<<8);
205 append16( lo ); 205 append16( lo );
206 } 206 }
207 } 207 }
208} 208}
209 209
210void Bu::UtfString::setUtf32( const Bu::String &sInput ) 210void Bu::UtfString::setUtf32( const Bu::String &sInput )
211{ 211{
212 Bu::String::const_iterator i = sInput.begin(); 212 Bu::String::const_iterator i = sInput.begin();
213 if( (uint8_t)*i == 0x00 && 213 if( (uint8_t)*i == 0x00 &&
214 (uint8_t)*(++i) == 0x00 && 214 (uint8_t)*(++i) == 0x00 &&
215 (uint8_t)*(++i) == 0xFF && 215 (uint8_t)*(++i) == 0xFF &&
216 (uint8_t)*(++i) == 0xFE ) 216 (uint8_t)*(++i) == 0xFE )
217 { 217 {
218 setUtf32le( sInput ); 218 setUtf32le( sInput );
219 return; 219 return;
220 } 220 }
221 setUtf32be( sInput ); 221 setUtf32be( sInput );
222} 222}
223 223
224void Bu::UtfString::setUtf32be( const Bu::String &sInput ) 224void Bu::UtfString::setUtf32be( const Bu::String &sInput )
225{ 225{
226 Bu::String::const_iterator i = sInput.begin(); 226 Bu::String::const_iterator i = sInput.begin();
227 if( (uint8_t)*i == 0x00 && 227 if( (uint8_t)*i == 0x00 &&
228 (uint8_t)*(++i) == 0x00 && 228 (uint8_t)*(++i) == 0x00 &&
229 (uint8_t)*(++i) == 0xFE && 229 (uint8_t)*(++i) == 0xFE &&
230 (uint8_t)*(++i) == 0xFF ) 230 (uint8_t)*(++i) == 0xFF )
231 { 231 {
232 i++; 232 i++;
233 sio << "Verified big endian." << sio.nl; 233 sio << "Verified big endian." << sio.nl;
234 } 234 }
235 else 235 else
236 { 236 {
237 i = sInput.begin(); 237 i = sInput.begin();
238 sio << "Assuming big endian." << sio.nl; 238 sio << "Assuming big endian." << sio.nl;
239 } 239 }
240 for( ; i; i++ ) 240 for( ; i; i++ )
241 { 241 {
242 append( (((uint8_t)*i)<<24) | 242 append( (((uint8_t)*i)<<24) |
243 (((uint8_t)*(++i))<<16) | 243 (((uint8_t)*(++i))<<16) |
244 (((uint8_t)*(++i))<<8) | 244 (((uint8_t)*(++i))<<8) |
245 ((uint8_t)*(++i)) 245 ((uint8_t)*(++i))
246 ); 246 );
247 } 247 }
248} 248}
249 249
250void Bu::UtfString::setUtf32le( const Bu::String &sInput ) 250void Bu::UtfString::setUtf32le( const Bu::String &sInput )
251{ 251{
252 Bu::String::const_iterator i = sInput.begin(); 252 Bu::String::const_iterator i = sInput.begin();
253 if( (uint8_t)*i == 0x00 && 253 if( (uint8_t)*i == 0x00 &&
254 (uint8_t)*(++i) == 0x00 && 254 (uint8_t)*(++i) == 0x00 &&
255 (uint8_t)*(++i) == 0xFF && 255 (uint8_t)*(++i) == 0xFF &&
256 (uint8_t)*(++i) == 0xFE ) 256 (uint8_t)*(++i) == 0xFE )
257 { 257 {
258 i++; 258 i++;
259 sio << "Verified little endian." << sio.nl; 259 sio << "Verified little endian." << sio.nl;
260 } 260 }
261 else 261 else
262 { 262 {
263 i = sInput.begin(); 263 i = sInput.begin();
264 sio << "Assuming little endian." << sio.nl; 264 sio << "Assuming little endian." << sio.nl;
265 } 265 }
266 for( ; i; i++ ) 266 for( ; i; i++ )
267 { 267 {
268 append( ((uint8_t)*i) | 268 append( ((uint8_t)*i) |
269 (((uint8_t)*(++i))<<8) | 269 (((uint8_t)*(++i))<<8) |
270 (((uint8_t)*(++i))<<16) | 270 (((uint8_t)*(++i))<<16) |
271 (((uint8_t)*(++i))<<24) 271 (((uint8_t)*(++i))<<24)
272 ); 272 );
273 } 273 }
274} 274}
275 275
276void Bu::UtfString::write( Bu::Stream &sOut, Encoding eEnc ) const 276void Bu::UtfString::write( Bu::Stream &sOut, Encoding eEnc ) const
277{ 277{
278 switch( eEnc ) 278 switch( eEnc )
279 { 279 {
280 case Utf8: 280 case Utf8:
281 writeUtf8( sOut ); 281 writeUtf8( sOut );
282 break; 282 break;
283 283
284 case Utf16: 284 case Utf16:
285// writeUtf16( sOut ); 285// writeUtf16( sOut );
286// break; 286// break;
287 287
288 case Utf16be: 288 case Utf16be:
289 writeUtf16be( sOut ); 289 writeUtf16be( sOut );
290 break; 290 break;
291 291
292 case Utf16le: 292 case Utf16le:
293 writeUtf16le( sOut ); 293 writeUtf16le( sOut );
294 break; 294 break;
295 295
296 case Utf32: 296 case Utf32:
297// writeUtf32( sOut ); 297// writeUtf32( sOut );
298// break; 298// break;
299 299
300 case Utf32be: 300 case Utf32be:
301 writeUtf32be( sOut ); 301 writeUtf32be( sOut );
302 break; 302 break;
303 303
304 case Utf32le: 304 case Utf32le:
305 writeUtf32le( sOut ); 305 writeUtf32le( sOut );
306 break; 306 break;
307 307
308 case Ucs2: 308 case Ucs2:
309 throw Bu::ExceptionBase("Ucs2 not supported yet."); 309 throw Bu::ExceptionBase("Ucs2 not supported yet.");
310 break; 310 break;
311 311
312 case Ucs4: 312 case Ucs4:
313 throw Bu::ExceptionBase("Ucs4 not supported yet."); 313 throw Bu::ExceptionBase("Ucs4 not supported yet.");
314 break; 314 break;
315 315
316 case GuessEncoding: 316 case GuessEncoding:
317 throw Bu::ExceptionBase( 317 throw Bu::ExceptionBase(
318 "GuessEncoding is incompatible with encoding."); 318 "GuessEncoding is incompatible with encoding.");
319 break; 319 break;
320 320
321 } 321 }
322} 322}
323 323
324void Bu::UtfString::writeUtf8( Bu::Stream &sOut ) const 324void Bu::UtfString::writeUtf8( Bu::Stream &sOut ) const
325{ 325{
326 int iPos = 0; 326 int iPos = 0;
327 while( iPos < aData.getSize() ) 327 while( iPos < aData.getSize() )
328 { 328 {
329 uint8_t uByte; 329 uint8_t uByte;
330 Bu::UtfChar chr = nextChar( iPos ); 330 Bu::UtfChar chr = nextChar( iPos );
331 if( chr >= 0x010000 ) 331 if( chr >= 0x010000 )
332 { 332 {
333 // Four bytes 333 // Four bytes
334 // 111 111111 111111 111111 334 // 111 111111 111111 111111
335 uByte = (chr>>18)|0xF0; 335 uByte = (chr>>18)|0xF0;
336 sOut.write( &uByte, 1 ); 336 sOut.write( &uByte, 1 );
337 uByte = ((chr>>12)&0x3F)|0x80; 337 uByte = ((chr>>12)&0x3F)|0x80;
338 sOut.write( &uByte, 1 ); 338 sOut.write( &uByte, 1 );
339 uByte = ((chr>>6)&0x3F)|0x80; 339 uByte = ((chr>>6)&0x3F)|0x80;
340 sOut.write( &uByte, 1 ); 340 sOut.write( &uByte, 1 );
341 uByte = (chr&0x3F)|0x80; 341 uByte = (chr&0x3F)|0x80;
342 sOut.write( &uByte, 1 ); 342 sOut.write( &uByte, 1 );
343 } 343 }
344 else if( chr >= 0x800 ) 344 else if( chr >= 0x800 )
345 { 345 {
346 // Three bytes 346 // Three bytes
347 // 1111 111111 111111 347 // 1111 111111 111111
348 uByte = (chr>>12)|0xE0; 348 uByte = (chr>>12)|0xE0;
349 sOut.write( &uByte, 1 ); 349 sOut.write( &uByte, 1 );
350 uByte = ((chr>>6)&0x3F)|0x80; 350 uByte = ((chr>>6)&0x3F)|0x80;
351 sOut.write( &uByte, 1 ); 351 sOut.write( &uByte, 1 );
352 uByte = (chr&0x3F)|0x80; 352 uByte = (chr&0x3F)|0x80;
353 sOut.write( &uByte, 1 ); 353 sOut.write( &uByte, 1 );
354 } 354 }
355 else if( chr >= 0x80 ) 355 else if( chr >= 0x80 )
356 { 356 {
357 // Two bytes 357 // Two bytes
358 // 11111 111111 358 // 11111 111111
359 uByte = (chr>>6)|0xC0; 359 uByte = (chr>>6)|0xC0;
360 sOut.write( &uByte, 1 ); 360 sOut.write( &uByte, 1 );
361 uByte = (chr&0x3F)|0x80; 361 uByte = (chr&0x3F)|0x80;
362 sOut.write( &uByte, 1 ); 362 sOut.write( &uByte, 1 );
363 } 363 }
364 else 364 else
365 { 365 {
366 // One byte 366 // One byte
367 uByte = chr; 367 uByte = chr;
368 sOut.write( &uByte, 1 ); 368 sOut.write( &uByte, 1 );
369 } 369 }
370 } 370 }
371} 371}
372/* 372/*
373void Bu::UtfString::writeUtf16( Bu::Stream &sOut ) 373void Bu::UtfString::writeUtf16( Bu::Stream &sOut )
@@ -377,228 +377,228 @@ void Bu::UtfString::writeUtf16( Bu::Stream &sOut )
377void Bu::UtfString::writeUtf16be( Bu::Stream &sOut ) const 377void Bu::UtfString::writeUtf16be( Bu::Stream &sOut ) const
378{ 378{
379#if BYTE_ORDER == BIG_ENDIAN 379#if BYTE_ORDER == BIG_ENDIAN
380 uint16_t iTmp = 0xFEFF; // Byte Order Marker 380 uint16_t iTmp = 0xFEFF; // Byte Order Marker
381 sOut.write( &iTmp, 2 ); 381 sOut.write( &iTmp, 2 );
382 for( Array<uint16_t>::const_iterator i = aData.begin(); i; i++ ) 382 for( Array<uint16_t>::const_iterator i = aData.begin(); i; i++ )
383 { 383 {
384 iTmp = *i; 384 iTmp = *i;
385 sOut.write( &iTmp, 2 ); 385 sOut.write( &iTmp, 2 );
386 } 386 }
387#else 387#else
388 uint16_t iTmp = 0xFEFF; // Byte Order Marker 388 uint16_t iTmp = 0xFEFF; // Byte Order Marker
389 iTmp = (iTmp>>8) | (iTmp<<8); 389 iTmp = (iTmp>>8) | (iTmp<<8);
390 sOut.write( &iTmp, 2 ); 390 sOut.write( &iTmp, 2 );
391 for( Array<uint16_t>::const_iterator i = aData.begin(); i; i++ ) 391 for( Array<uint16_t>::const_iterator i = aData.begin(); i; i++ )
392 { 392 {
393 iTmp = *i; 393 iTmp = *i;
394 iTmp = (iTmp>>8) | (iTmp<<8); 394 iTmp = (iTmp>>8) | (iTmp<<8);
395 sOut.write( &iTmp, 2 ); 395 sOut.write( &iTmp, 2 );
396 } 396 }
397#endif 397#endif
398} 398}
399 399
400void Bu::UtfString::writeUtf16le( Bu::Stream &sOut ) const 400void Bu::UtfString::writeUtf16le( Bu::Stream &sOut ) const
401{ 401{
402#if BYTE_ORDER == LITTLE_ENDIAN 402#if BYTE_ORDER == LITTLE_ENDIAN
403 uint16_t iTmp = 0xFEFF; // Byte Order Marker 403 uint16_t iTmp = 0xFEFF; // Byte Order Marker
404 sOut.write( &iTmp, 2 ); 404 sOut.write( &iTmp, 2 );
405 for( Array<uint16_t>::const_iterator i = aData.begin(); i; i++ ) 405 for( Array<uint16_t>::const_iterator i = aData.begin(); i; i++ )
406 { 406 {
407 iTmp = *i; 407 iTmp = *i;
408 sOut.write( &iTmp, 2 ); 408 sOut.write( &iTmp, 2 );
409 } 409 }
410#else 410#else
411 uint16_t iTmp = 0xFEFF; // Byte Order Marker 411 uint16_t iTmp = 0xFEFF; // Byte Order Marker
412 iTmp = (iTmp>>8) | (iTmp<<8); 412 iTmp = (iTmp>>8) | (iTmp<<8);
413 sOut.write( &iTmp, 2 ); 413 sOut.write( &iTmp, 2 );
414 for( Array<uint16_t>::const_iterator i = aData.begin(); i; i++ ) 414 for( Array<uint16_t>::const_iterator i = aData.begin(); i; i++ )
415 { 415 {
416 iTmp = *i; 416 iTmp = *i;
417 iTmp = (iTmp>>8) | (iTmp<<8); 417 iTmp = (iTmp>>8) | (iTmp<<8);
418 sOut.write( &iTmp, 2 ); 418 sOut.write( &iTmp, 2 );
419 } 419 }
420#endif 420#endif
421} 421}
422 422
423void Bu::UtfString::writeUtf32be( Bu::Stream &sOut ) const 423void Bu::UtfString::writeUtf32be( Bu::Stream &sOut ) const
424{ 424{
425#if BYTE_ORDER == BIG_ENDIAN 425#if BYTE_ORDER == BIG_ENDIAN
426 uint32_t iTmp = 0xFEFF; // Byte Order Marker 426 uint32_t iTmp = 0xFEFF; // Byte Order Marker
427 sOut.write( &iTmp, 4 ); 427 sOut.write( &iTmp, 4 );
428 int i = 0; 428 int i = 0;
429 while( i < aData.getSize() ) 429 while( i < aData.getSize() )
430 { 430 {
431 iTmp = nextChar( i ); 431 iTmp = nextChar( i );
432 sOut.write( &iTmp, 4 ); 432 sOut.write( &iTmp, 4 );
433 } 433 }
434#else 434#else
435 uint32_t iTmp = 0xFEFF; // Byte Order Marker 435 uint32_t iTmp = 0xFEFF; // Byte Order Marker
436 iTmp = (iTmp>>24)|(iTmp<<24)|((iTmp&0xff0000)>>8)|((iTmp&0xff00)<<8); 436 iTmp = (iTmp>>24)|(iTmp<<24)|((iTmp&0xff0000)>>8)|((iTmp&0xff00)<<8);
437 sOut.write( &iTmp, 4 ); 437 sOut.write( &iTmp, 4 );
438 int i = 0; 438 int i = 0;
439 while( i < aData.getSize() ) 439 while( i < aData.getSize() )
440 { 440 {
441 iTmp = nextChar( i ); 441 iTmp = nextChar( i );
442 iTmp = (iTmp>>24)|(iTmp<<24)|((iTmp&0xff0000)>>8)|((iTmp&0xff00)<<8); 442 iTmp = (iTmp>>24)|(iTmp<<24)|((iTmp&0xff0000)>>8)|((iTmp&0xff00)<<8);
443 sOut.write( &iTmp, 4 ); 443 sOut.write( &iTmp, 4 );
444 } 444 }
445#endif 445#endif
446} 446}
447 447
448void Bu::UtfString::writeUtf32le( Bu::Stream &sOut ) const 448void Bu::UtfString::writeUtf32le( Bu::Stream &sOut ) const
449{ 449{
450#if BYTE_ORDER == LITTLE_ENDIAN 450#if BYTE_ORDER == LITTLE_ENDIAN
451 uint32_t iTmp = 0xFEFF; // Byte Order Marker 451 uint32_t iTmp = 0xFEFF; // Byte Order Marker
452 sOut.write( &iTmp, 4 ); 452 sOut.write( &iTmp, 4 );
453 int i = 0; 453 int i = 0;
454 while( i < aData.getSize() ) 454 while( i < aData.getSize() )
455 { 455 {
456 iTmp = nextChar( i ); 456 iTmp = nextChar( i );
457 sOut.write( &iTmp, 4 ); 457 sOut.write( &iTmp, 4 );
458 } 458 }
459#else 459#else
460 uint32_t iTmp = 0xFEFF; // Byte Order Marker 460 uint32_t iTmp = 0xFEFF; // Byte Order Marker
461 iTmp = (iTmp>>24)|(iTmp<<24)|((iTmp&0xff0000)>>8)|((iTmp&0xff00)<<8); 461 iTmp = (iTmp>>24)|(iTmp<<24)|((iTmp&0xff0000)>>8)|((iTmp&0xff00)<<8);
462 sOut.write( &iTmp, 4 ); 462 sOut.write( &iTmp, 4 );
463 int i = 0; 463 int i = 0;
464 while( i < aData.getSize() ) 464 while( i < aData.getSize() )
465 { 465 {
466 iTmp = nextChar( i ); 466 iTmp = nextChar( i );
467 iTmp = (iTmp>>24)|(iTmp<<24)|((iTmp&0xff0000)>>8)|((iTmp&0xff00)<<8); 467 iTmp = (iTmp>>24)|(iTmp<<24)|((iTmp&0xff0000)>>8)|((iTmp&0xff00)<<8);
468 sOut.write( &iTmp, 4 ); 468 sOut.write( &iTmp, 4 );
469 } 469 }
470#endif 470#endif
471} 471}
472 472
473Bu::UtfChar Bu::UtfString::get( int iIndex ) const 473Bu::UtfChar Bu::UtfString::get( int iIndex ) const
474{ 474{
475 return nextChar( iIndex ); 475 return nextChar( iIndex );
476} 476}
477 477
478Bu::UtfChar Bu::UtfString::nextChar( int &iIndex ) const 478Bu::UtfChar Bu::UtfString::nextChar( int &iIndex ) const
479{ 479{
480 Bu::UtfChar i = aData[iIndex++]; 480 Bu::UtfChar i = aData[iIndex++];
481 switch( i&0xFC00 ) 481 switch( i&0xFC00 )
482 { 482 {
483 case 0xD800: 483 case 0xD800:
484 return (((i&0x3FF)<<10) | ((aData[iIndex++]&0x3FF)))+0x10000; 484 return (((i&0x3FF)<<10) | ((aData[iIndex++]&0x3FF)))+0x10000;
485 485
486 case 0xDC00: 486 case 0xDC00:
487 return (((aData[iIndex-2]&0x3FF)<<10) | ((i&0x3FF)))+0x10000; 487 return (((aData[iIndex-2]&0x3FF)<<10) | ((i&0x3FF)))+0x10000;
488 488
489 default: 489 default:
490 return i; 490 return i;
491 } 491 }
492} 492}
493 493
494bool Bu::UtfString::operator==( const Bu::UtfString &rhs ) const 494bool Bu::UtfString::operator==( const Bu::UtfString &rhs ) const
495{ 495{
496 return aData == rhs.aData; 496 return aData == rhs.aData;
497} 497}
498 498
499Bu::UtfString &Bu::UtfString::operator+=( const Bu::UtfString &rhs ) 499Bu::UtfString &Bu::UtfString::operator+=( const Bu::UtfString &rhs )
500{ 500{
501 append( rhs ); 501 append( rhs );
502 return *this; 502 return *this;
503} 503}
504 504
505Bu::UtfString &Bu::UtfString::operator+=( const UtfChar &rhs ) 505Bu::UtfString &Bu::UtfString::operator+=( const UtfChar &rhs )
506{ 506{
507 append( rhs ); 507 append( rhs );
508 return *this; 508 return *this;
509} 509}
510 510
511Bu::String Bu::UtfString::get( Encoding eEnc ) const 511Bu::String Bu::UtfString::get( Encoding eEnc ) const
512{ 512{
513 Bu::MemBuf mb; 513 Bu::MemBuf mb;
514 write( mb, eEnc ); 514 write( mb, eEnc );
515 return mb.getString(); 515 return mb.getString();
516} 516}
517 517
518void Bu::UtfString::debug() const 518void Bu::UtfString::debug() const
519{ 519{
520 sio << "Raw Utf16: "; 520 sio << "Raw Utf16: ";
521 for( int i = 0; i < aData.getSize(); i++ ) 521 for( int i = 0; i < aData.getSize(); i++ )
522 { 522 {
523 if( i > 0 ) 523 if( i > 0 )
524 sio << ", "; 524 sio << ", ";
525 sio << "0x" << Fmt::hex() << aData[i]; 525 sio << "0x" << Fmt::hex() << aData[i];
526 } 526 }
527 sio << sio.nl; 527 sio << sio.nl;
528 sio << "Code Points: "; 528 sio << "Code Points: ";
529 for( int i = 0; i < aData.getSize(); i++ ) 529 for( int i = 0; i < aData.getSize(); i++ )
530 { 530 {
531 if( i > 0 ) 531 if( i > 0 )
532 sio << ", "; 532 sio << ", ";
533 sio << "0x" << Fmt::hex() << nextChar( i ); 533 sio << "0x" << Fmt::hex() << nextChar( i );
534 } 534 }
535 sio << sio.nl; 535 sio << sio.nl;
536} 536}
537/* 537/*
538void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 ) 538void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 )
539{ 539{
540 static uint8_t lmask[8] = { 540 static uint8_t lmask[8] = {
541 0x00, 541 0x00,
542 0x01, 542 0x01,
543 0x03, 543 0x03,
544 0x07, 544 0x07,
545 0x0f, 545 0x0f,
546 0x1f, 546 0x1f,
547 0x3f, 547 0x3f,
548 0x7f 548 0x7f
549 }; 549 };
550 for( Bu::String::const_iterator i = sUtf8.begin(); i; i++ ) 550 for( Bu::String::const_iterator i = sUtf8.begin(); i; i++ )
551 { 551 {
552 if( i != sUtf8.begin() ) 552 if( i != sUtf8.begin() )
553 sio << ", "; 553 sio << ", ";
554 if( ((int)(uint8_t)*i)&0x80 ) 554 if( ((int)(uint8_t)*i)&0x80 )
555 { 555 {
556// sio << "Flag byte: " << Bu::Fmt().radix(2).width(8).fill('0') 556// sio << "Flag byte: " << Bu::Fmt().radix(2).width(8).fill('0')
557// << (int)(uint8_t)*i << sio.nl; 557// << (int)(uint8_t)*i << sio.nl;
558 int iBytes = 1; 558 int iBytes = 1;
559 for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { } 559 for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { }
560// sio << "iBytes = " << iBytes << sio.nl; 560// sio << "iBytes = " << iBytes << sio.nl;
561 Bu::UtfChar uPt = ((*i) & lmask[7-iBytes])<<(6*(iBytes-1)); 561 Bu::UtfChar uPt = ((*i) & lmask[7-iBytes])<<(6*(iBytes-1));
562// sio << "mask: " << Bu::Fmt().radix(2).width(8).fill('0') 562// sio << "mask: " << Bu::Fmt().radix(2).width(8).fill('0')
563// << (int)lmask[7-iBytes] << sio.nl; 563// << (int)lmask[7-iBytes] << sio.nl;
564 for( iBytes--; iBytes >= 1; iBytes-- ) 564 for( iBytes--; iBytes >= 1; iBytes-- )
565 { 565 {
566// sio << "iBytes = " << iBytes << ", shift = " << (6*(iBytes-1)) 566// sio << "iBytes = " << iBytes << ", shift = " << (6*(iBytes-1))
567// << sio.nl; 567// << sio.nl;
568// sio << "next: " << Bu::Fmt().radix(2).width(8).fill('0') 568// sio << "next: " << Bu::Fmt().radix(2).width(8).fill('0')
569// << (int)(uint8_t)*i << sio.nl 569// << (int)(uint8_t)*i << sio.nl
570// << "mask: " << Bu::Fmt().radix(2).width(8).fill('0') 570// << "mask: " << Bu::Fmt().radix(2).width(8).fill('0')
571// << (int)lmask[6] << sio.nl; 571// << (int)lmask[6] << sio.nl;
572 i++; 572 i++;
573 uPt |= ((*i)&lmask[6])<<(6*(iBytes-1)); 573 uPt |= ((*i)&lmask[6])<<(6*(iBytes-1));
574 } 574 }
575 sio << uPt; 575 sio << uPt;
576// sio << " (" << Bu::Fmt( 8, 2 ).fill('0') 576// sio << " (" << Bu::Fmt( 8, 2 ).fill('0')
577// << uPt << ")"; 577// << uPt << ")";
578 } 578 }
579 else 579 else
580 { 580 {
581 sio << (int)((uint8_t)*i); 581 sio << (int)((uint8_t)*i);
582 } 582 }
583 } 583 }
584 sio << sio.nl; 584 sio << sio.nl;
585} 585}
586*/ 586*/
587 587
588template<> uint32_t Bu::__calcHashCode<Bu::UtfString>( const Bu::UtfString &k ) 588template<> uint32_t Bu::__calcHashCode<Bu::UtfString>( const Bu::UtfString &k )
589{ 589{
590 uint32_t uCode = 0; 590 uint32_t uCode = 0;
591 591
592 for( Bu::UtfString::const_iterator i = k.begin(); i; i++ ) 592 for( Bu::UtfString::const_iterator i = k.begin(); i; i++ )
593 { 593 {
594 uCode = *i + (uCode<<6) + (uCode<<16) - uCode; 594 uCode = *i + (uCode<<6) + (uCode<<16) - uCode;
595 } 595 }
596 596
597 return uCode; 597 return uCode;
598} 598}
599 599
600template<> bool Bu::__cmpHashKeys<Bu::UtfString>( 600template<> bool Bu::__cmpHashKeys<Bu::UtfString>(
601 const Bu::UtfString &a, const Bu::UtfString &b ) 601 const Bu::UtfString &a, const Bu::UtfString &b )
602{ 602{
603 return a == b; 603 return a == b;
604} 604}