diff options
Diffstat (limited to '')
-rw-r--r-- | src/unstable/utfstring.cpp | 255 |
1 files changed, 229 insertions, 26 deletions
diff --git a/src/unstable/utfstring.cpp b/src/unstable/utfstring.cpp index f945725..46c78e6 100644 --- a/src/unstable/utfstring.cpp +++ b/src/unstable/utfstring.cpp | |||
@@ -12,8 +12,21 @@ | |||
12 | #include "bu/config.h" | 12 | #include "bu/config.h" |
13 | #include "bu/sio.h" | 13 | #include "bu/sio.h" |
14 | #include "bu/membuf.h" | 14 | #include "bu/membuf.h" |
15 | #include "bu/formatter.h" | ||
16 | |||
15 | using Bu::sio; | 17 | using Bu::sio; |
16 | 18 | ||
19 | uint8_t Bu::UtfString::utf8_lmask[8] = { | ||
20 | 0x00, | ||
21 | 0x01, | ||
22 | 0x03, | ||
23 | 0x07, | ||
24 | 0x0f, | ||
25 | 0x1f, | ||
26 | 0x3f, | ||
27 | 0x7f | ||
28 | }; | ||
29 | |||
17 | Bu::UtfString::UtfString() | 30 | Bu::UtfString::UtfString() |
18 | { | 31 | { |
19 | } | 32 | } |
@@ -111,27 +124,17 @@ void Bu::UtfString::append( const UtfString &rSrc ) | |||
111 | 124 | ||
112 | void Bu::UtfString::setUtf8( const Bu::String &sInput ) | 125 | void Bu::UtfString::setUtf8( const Bu::String &sInput ) |
113 | { | 126 | { |
114 | static uint8_t lmask[8] = { | ||
115 | 0x00, | ||
116 | 0x01, | ||
117 | 0x03, | ||
118 | 0x07, | ||
119 | 0x0f, | ||
120 | 0x1f, | ||
121 | 0x3f, | ||
122 | 0x7f | ||
123 | }; | ||
124 | for( Bu::String::const_iterator i = sInput.begin(); i; i++ ) | 127 | for( Bu::String::const_iterator i = sInput.begin(); i; i++ ) |
125 | { | 128 | { |
126 | if( ((int)(uint8_t)*i)&0x80 ) | 129 | if( ((int)(uint8_t)*i)&0x80 ) |
127 | { | 130 | { |
128 | int iBytes = 1; | 131 | int iBytes = 1; |
129 | for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { } | 132 | for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { } |
130 | Bu::UtfChar uPt = ((*i) & lmask[7-iBytes])<<(6*(iBytes-1)); | 133 | Bu::UtfChar uPt = ((*i) & utf8_lmask[7-iBytes])<<(6*(iBytes-1)); |
131 | for( iBytes--; iBytes >= 1; iBytes-- ) | 134 | for( iBytes--; iBytes >= 1; iBytes-- ) |
132 | { | 135 | { |
133 | i++; | 136 | i++; |
134 | uPt |= ((*i)&lmask[6])<<(6*(iBytes-1)); | 137 | uPt |= ((*i)&utf8_lmask[6])<<(6*(iBytes-1)); |
135 | } | 138 | } |
136 | append( uPt ); | 139 | append( uPt ); |
137 | } | 140 | } |
@@ -321,6 +324,133 @@ void Bu::UtfString::write( Bu::Stream &sOut, Encoding eEnc ) const | |||
321 | } | 324 | } |
322 | } | 325 | } |
323 | 326 | ||
327 | int Bu::UtfString::readPoint( Bu::Stream &sIn, Bu::UtfChar &c, | ||
328 | Bu::UtfString::Encoding sEnc ) | ||
329 | { | ||
330 | switch( sEnc ) | ||
331 | { | ||
332 | case Utf8: | ||
333 | { | ||
334 | uint8_t i; | ||
335 | int iRead = 1; | ||
336 | if( sIn.read( &i, 1 ) < 1 ) | ||
337 | return 0; | ||
338 | if( ((int)i)&0x80 ) | ||
339 | { | ||
340 | int iBytes = 1; | ||
341 | for(; (((uint8_t)i)<<iBytes)&0x80; iBytes++ ) { } | ||
342 | iRead = iBytes; | ||
343 | c = (i & utf8_lmask[7-iBytes])<<(6*(iBytes-1)); | ||
344 | for( iBytes--; iBytes >= 1; iBytes-- ) | ||
345 | { | ||
346 | if( sIn.read( &i, 1 ) < 1 ) | ||
347 | return 0; | ||
348 | c |= (i&utf8_lmask[6])<<(6*(iBytes-1)); | ||
349 | } | ||
350 | return iRead; | ||
351 | } | ||
352 | else | ||
353 | { | ||
354 | c = (Bu::UtfChar)i; | ||
355 | return 1; | ||
356 | } | ||
357 | } | ||
358 | break; | ||
359 | |||
360 | case Utf16: | ||
361 | case Utf16be: | ||
362 | case Utf16le: | ||
363 | case Utf32: | ||
364 | case Utf32be: | ||
365 | case Utf32le: | ||
366 | case Ucs2: | ||
367 | case Ucs4: | ||
368 | case GuessEncoding: | ||
369 | throw Bu::ExceptionBase("Not implemented."); | ||
370 | break; | ||
371 | } | ||
372 | return -1; | ||
373 | } | ||
374 | |||
375 | int Bu::UtfString::writePoint( Bu::Stream &sOut, const Bu::UtfChar &c, | ||
376 | Bu::UtfString::Encoding sEnc ) | ||
377 | { | ||
378 | switch( sEnc ) | ||
379 | { | ||
380 | case Utf8: | ||
381 | { | ||
382 | uint8_t uByte; | ||
383 | if( c >= 0x010000 ) | ||
384 | { | ||
385 | // Four bytes | ||
386 | // 111 111111 111111 111111 | ||
387 | uByte = (c>>18)|0xF0; | ||
388 | sOut.write( &uByte, 1 ); | ||
389 | uByte = ((c>>12)&0x3F)|0x80; | ||
390 | sOut.write( &uByte, 1 ); | ||
391 | uByte = ((c>>6)&0x3F)|0x80; | ||
392 | sOut.write( &uByte, 1 ); | ||
393 | uByte = (c&0x3F)|0x80; | ||
394 | sOut.write( &uByte, 1 ); | ||
395 | return 4; | ||
396 | } | ||
397 | else if( c >= 0x800 ) | ||
398 | { | ||
399 | // Three bytes | ||
400 | // 1111 111111 111111 | ||
401 | uByte = (c>>12)|0xE0; | ||
402 | sOut.write( &uByte, 1 ); | ||
403 | uByte = ((c>>6)&0x3F)|0x80; | ||
404 | sOut.write( &uByte, 1 ); | ||
405 | uByte = (c&0x3F)|0x80; | ||
406 | sOut.write( &uByte, 1 ); | ||
407 | return 3; | ||
408 | } | ||
409 | else if( c >= 0x80 ) | ||
410 | { | ||
411 | // Two bytes | ||
412 | // 11111 111111 | ||
413 | uByte = (c>>6)|0xC0; | ||
414 | sOut.write( &uByte, 1 ); | ||
415 | uByte = (c&0x3F)|0x80; | ||
416 | sOut.write( &uByte, 1 ); | ||
417 | return 2; | ||
418 | } | ||
419 | else | ||
420 | { | ||
421 | // One byte | ||
422 | uByte = c; | ||
423 | sOut.write( &uByte, 1 ); | ||
424 | return 1; | ||
425 | } | ||
426 | } | ||
427 | break; | ||
428 | |||
429 | case Utf16: | ||
430 | case Utf16be: | ||
431 | case Utf16le: | ||
432 | case Utf32: | ||
433 | case Utf32be: | ||
434 | case Utf32le: | ||
435 | case Ucs2: | ||
436 | case Ucs4: | ||
437 | case GuessEncoding: | ||
438 | throw Bu::ExceptionBase("Not implemented."); | ||
439 | break; | ||
440 | } | ||
441 | return -1; | ||
442 | } | ||
443 | |||
444 | int32_t Bu::UtfString::toInt32( int iRadix ) const | ||
445 | { | ||
446 | return strtol( get().getStr(), NULL, iRadix ); | ||
447 | } | ||
448 | |||
449 | int64_t Bu::UtfString::toInt64( int iRadix ) const | ||
450 | { | ||
451 | return strtoll( get().getStr(), NULL, iRadix ); | ||
452 | } | ||
453 | |||
324 | void Bu::UtfString::writeUtf8( Bu::Stream &sOut ) const | 454 | void Bu::UtfString::writeUtf8( Bu::Stream &sOut ) const |
325 | { | 455 | { |
326 | int iPos = 0; | 456 | int iPos = 0; |
@@ -496,6 +626,33 @@ bool Bu::UtfString::operator==( const Bu::UtfString &rhs ) const | |||
496 | return aData == rhs.aData; | 626 | return aData == rhs.aData; |
497 | } | 627 | } |
498 | 628 | ||
629 | bool Bu::UtfString::operator==( const Bu::String &rhs ) const | ||
630 | { | ||
631 | // Nieve comparison | ||
632 | if( aData.getSize() != rhs.getSize() ) | ||
633 | return false; | ||
634 | |||
635 | for( int j = 0; j < aData.getSize(); j++ ) | ||
636 | { | ||
637 | if( aData[j] != rhs[j] ) | ||
638 | return false; | ||
639 | } | ||
640 | |||
641 | return true; | ||
642 | } | ||
643 | |||
644 | bool Bu::UtfString::operator==( const char *rhs ) const | ||
645 | { | ||
646 | // Nieve comparison | ||
647 | for( int j = 0; j < aData.getSize(); j++ ) | ||
648 | { | ||
649 | if( rhs[j] == '\0' || aData[j] != rhs[j] ) | ||
650 | return false; | ||
651 | } | ||
652 | |||
653 | return true; | ||
654 | } | ||
655 | |||
499 | Bu::UtfString &Bu::UtfString::operator+=( const Bu::UtfString &rhs ) | 656 | Bu::UtfString &Bu::UtfString::operator+=( const Bu::UtfString &rhs ) |
500 | { | 657 | { |
501 | append( rhs ); | 658 | append( rhs ); |
@@ -508,6 +665,56 @@ Bu::UtfString &Bu::UtfString::operator+=( const UtfChar &rhs ) | |||
508 | return *this; | 665 | return *this; |
509 | } | 666 | } |
510 | 667 | ||
668 | bool Bu::UtfString::operator<( const Bu::UtfString &rhs ) const | ||
669 | { | ||
670 | for( int j = 0; j < aData.getSize() && j < rhs.aData.getSize(); j++ ) | ||
671 | { | ||
672 | if( aData[j] != rhs.aData[j] ) | ||
673 | return aData[j] < rhs.aData[j]; | ||
674 | } | ||
675 | |||
676 | return false; | ||
677 | } | ||
678 | |||
679 | bool Bu::UtfString::operator<=( const Bu::UtfString &rhs ) const | ||
680 | { | ||
681 | for( int j = 0; j < aData.getSize() && j < rhs.aData.getSize(); j++ ) | ||
682 | { | ||
683 | if( aData[j] != rhs.aData[j] ) | ||
684 | return aData[j] < rhs.aData[j]; | ||
685 | } | ||
686 | |||
687 | if( aData.getSize() == rhs.aData.getSize() ) | ||
688 | return true; | ||
689 | |||
690 | return false; | ||
691 | } | ||
692 | |||
693 | bool Bu::UtfString::operator>( const Bu::UtfString &rhs ) const | ||
694 | { | ||
695 | for( int j = 0; j < aData.getSize() && j < rhs.aData.getSize(); j++ ) | ||
696 | { | ||
697 | if( aData[j] != rhs.aData[j] ) | ||
698 | return aData[j] > rhs.aData[j]; | ||
699 | } | ||
700 | |||
701 | return false; | ||
702 | } | ||
703 | |||
704 | bool Bu::UtfString::operator>=( const Bu::UtfString &rhs ) const | ||
705 | { | ||
706 | for( int j = 0; j < aData.getSize() && j < rhs.aData.getSize(); j++ ) | ||
707 | { | ||
708 | if( aData[j] != rhs.aData[j] ) | ||
709 | return aData[j] > rhs.aData[j]; | ||
710 | } | ||
711 | |||
712 | if( aData.getSize() == rhs.aData.getSize() ) | ||
713 | return true; | ||
714 | |||
715 | return false; | ||
716 | } | ||
717 | |||
511 | Bu::String Bu::UtfString::get( Encoding eEnc ) const | 718 | Bu::String Bu::UtfString::get( Encoding eEnc ) const |
512 | { | 719 | { |
513 | Bu::MemBuf mb; | 720 | Bu::MemBuf mb; |
@@ -537,16 +744,6 @@ void Bu::UtfString::debug() const | |||
537 | /* | 744 | /* |
538 | void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 ) | 745 | void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 ) |
539 | { | 746 | { |
540 | static uint8_t lmask[8] = { | ||
541 | 0x00, | ||
542 | 0x01, | ||
543 | 0x03, | ||
544 | 0x07, | ||
545 | 0x0f, | ||
546 | 0x1f, | ||
547 | 0x3f, | ||
548 | 0x7f | ||
549 | }; | ||
550 | for( Bu::String::const_iterator i = sUtf8.begin(); i; i++ ) | 747 | for( Bu::String::const_iterator i = sUtf8.begin(); i; i++ ) |
551 | { | 748 | { |
552 | if( i != sUtf8.begin() ) | 749 | if( i != sUtf8.begin() ) |
@@ -558,9 +755,9 @@ void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 ) | |||
558 | int iBytes = 1; | 755 | int iBytes = 1; |
559 | for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { } | 756 | for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { } |
560 | // sio << "iBytes = " << iBytes << sio.nl; | 757 | // sio << "iBytes = " << iBytes << sio.nl; |
561 | Bu::UtfChar uPt = ((*i) & lmask[7-iBytes])<<(6*(iBytes-1)); | 758 | Bu::UtfChar uPt = ((*i) & utf8_lmask[7-iBytes])<<(6*(iBytes-1)); |
562 | // sio << "mask: " << Bu::Fmt().radix(2).width(8).fill('0') | 759 | // sio << "mask: " << Bu::Fmt().radix(2).width(8).fill('0') |
563 | // << (int)lmask[7-iBytes] << sio.nl; | 760 | // << (int)utf8_lmask[7-iBytes] << sio.nl; |
564 | for( iBytes--; iBytes >= 1; iBytes-- ) | 761 | for( iBytes--; iBytes >= 1; iBytes-- ) |
565 | { | 762 | { |
566 | // sio << "iBytes = " << iBytes << ", shift = " << (6*(iBytes-1)) | 763 | // sio << "iBytes = " << iBytes << ", shift = " << (6*(iBytes-1)) |
@@ -568,9 +765,9 @@ void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 ) | |||
568 | // sio << "next: " << Bu::Fmt().radix(2).width(8).fill('0') | 765 | // sio << "next: " << Bu::Fmt().radix(2).width(8).fill('0') |
569 | // << (int)(uint8_t)*i << sio.nl | 766 | // << (int)(uint8_t)*i << sio.nl |
570 | // << "mask: " << Bu::Fmt().radix(2).width(8).fill('0') | 767 | // << "mask: " << Bu::Fmt().radix(2).width(8).fill('0') |
571 | // << (int)lmask[6] << sio.nl; | 768 | // << (int)utf8_lmask[6] << sio.nl; |
572 | i++; | 769 | i++; |
573 | uPt |= ((*i)&lmask[6])<<(6*(iBytes-1)); | 770 | uPt |= ((*i)&utf8_lmask[6])<<(6*(iBytes-1)); |
574 | } | 771 | } |
575 | sio << uPt; | 772 | sio << uPt; |
576 | // sio << " (" << Bu::Fmt( 8, 2 ).fill('0') | 773 | // sio << " (" << Bu::Fmt( 8, 2 ).fill('0') |
@@ -602,3 +799,9 @@ template<> bool Bu::__cmpHashKeys<Bu::UtfString>( | |||
602 | { | 799 | { |
603 | return a == b; | 800 | return a == b; |
604 | } | 801 | } |
802 | |||
803 | Bu::Formatter Bu::operator<<( Bu::Formatter &f, const Bu::UtfString &s ) | ||
804 | { | ||
805 | return f << s.get(); | ||
806 | } | ||
807 | |||