diff options
| author | Mike Buland <eichlan@xagasoft.com> | 2019-05-13 19:47:19 -0700 |
|---|---|---|
| committer | Mike Buland <eichlan@xagasoft.com> | 2019-05-13 19:47:19 -0700 |
| commit | d605d6c3c04c1e26121f9b1c5c1d2dbcc5f7bc37 (patch) | |
| tree | 0cd21d420fc67ae757ec2475610c4624fd714363 /src/unstable/utfstring.cpp | |
| parent | 62753c815b5ec34ebfae37a3c89187a01cc17160 (diff) | |
| download | libbu++-d605d6c3c04c1e26121f9b1c5c1d2dbcc5f7bc37.tar.gz libbu++-d605d6c3c04c1e26121f9b1c5c1d2dbcc5f7bc37.tar.bz2 libbu++-d605d6c3c04c1e26121f9b1c5c1d2dbcc5f7bc37.tar.xz libbu++-d605d6c3c04c1e26121f9b1c5c1d2dbcc5f7bc37.zip | |
UtfString & Json overhaul.
UtfString supports a load of new stuff, and Json uses UtfString
exclusively now.
Diffstat (limited to '')
| -rw-r--r-- | src/unstable/utfstring.cpp | 255 |
1 files changed, 229 insertions, 26 deletions
diff --git a/src/unstable/utfstring.cpp b/src/unstable/utfstring.cpp index f945725..46c78e6 100644 --- a/src/unstable/utfstring.cpp +++ b/src/unstable/utfstring.cpp | |||
| @@ -12,8 +12,21 @@ | |||
| 12 | #include "bu/config.h" | 12 | #include "bu/config.h" |
| 13 | #include "bu/sio.h" | 13 | #include "bu/sio.h" |
| 14 | #include "bu/membuf.h" | 14 | #include "bu/membuf.h" |
| 15 | #include "bu/formatter.h" | ||
| 16 | |||
| 15 | using Bu::sio; | 17 | using Bu::sio; |
| 16 | 18 | ||
| 19 | uint8_t Bu::UtfString::utf8_lmask[8] = { | ||
| 20 | 0x00, | ||
| 21 | 0x01, | ||
| 22 | 0x03, | ||
| 23 | 0x07, | ||
| 24 | 0x0f, | ||
| 25 | 0x1f, | ||
| 26 | 0x3f, | ||
| 27 | 0x7f | ||
| 28 | }; | ||
| 29 | |||
| 17 | Bu::UtfString::UtfString() | 30 | Bu::UtfString::UtfString() |
| 18 | { | 31 | { |
| 19 | } | 32 | } |
| @@ -111,27 +124,17 @@ void Bu::UtfString::append( const UtfString &rSrc ) | |||
| 111 | 124 | ||
| 112 | void Bu::UtfString::setUtf8( const Bu::String &sInput ) | 125 | void Bu::UtfString::setUtf8( const Bu::String &sInput ) |
| 113 | { | 126 | { |
| 114 | static uint8_t lmask[8] = { | ||
| 115 | 0x00, | ||
| 116 | 0x01, | ||
| 117 | 0x03, | ||
| 118 | 0x07, | ||
| 119 | 0x0f, | ||
| 120 | 0x1f, | ||
| 121 | 0x3f, | ||
| 122 | 0x7f | ||
| 123 | }; | ||
| 124 | for( Bu::String::const_iterator i = sInput.begin(); i; i++ ) | 127 | for( Bu::String::const_iterator i = sInput.begin(); i; i++ ) |
| 125 | { | 128 | { |
| 126 | if( ((int)(uint8_t)*i)&0x80 ) | 129 | if( ((int)(uint8_t)*i)&0x80 ) |
| 127 | { | 130 | { |
| 128 | int iBytes = 1; | 131 | int iBytes = 1; |
| 129 | for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { } | 132 | for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { } |
| 130 | Bu::UtfChar uPt = ((*i) & lmask[7-iBytes])<<(6*(iBytes-1)); | 133 | Bu::UtfChar uPt = ((*i) & utf8_lmask[7-iBytes])<<(6*(iBytes-1)); |
| 131 | for( iBytes--; iBytes >= 1; iBytes-- ) | 134 | for( iBytes--; iBytes >= 1; iBytes-- ) |
| 132 | { | 135 | { |
| 133 | i++; | 136 | i++; |
| 134 | uPt |= ((*i)&lmask[6])<<(6*(iBytes-1)); | 137 | uPt |= ((*i)&utf8_lmask[6])<<(6*(iBytes-1)); |
| 135 | } | 138 | } |
| 136 | append( uPt ); | 139 | append( uPt ); |
| 137 | } | 140 | } |
| @@ -321,6 +324,133 @@ void Bu::UtfString::write( Bu::Stream &sOut, Encoding eEnc ) const | |||
| 321 | } | 324 | } |
| 322 | } | 325 | } |
| 323 | 326 | ||
| 327 | int Bu::UtfString::readPoint( Bu::Stream &sIn, Bu::UtfChar &c, | ||
| 328 | Bu::UtfString::Encoding sEnc ) | ||
| 329 | { | ||
| 330 | switch( sEnc ) | ||
| 331 | { | ||
| 332 | case Utf8: | ||
| 333 | { | ||
| 334 | uint8_t i; | ||
| 335 | int iRead = 1; | ||
| 336 | if( sIn.read( &i, 1 ) < 1 ) | ||
| 337 | return 0; | ||
| 338 | if( ((int)i)&0x80 ) | ||
| 339 | { | ||
| 340 | int iBytes = 1; | ||
| 341 | for(; (((uint8_t)i)<<iBytes)&0x80; iBytes++ ) { } | ||
| 342 | iRead = iBytes; | ||
| 343 | c = (i & utf8_lmask[7-iBytes])<<(6*(iBytes-1)); | ||
| 344 | for( iBytes--; iBytes >= 1; iBytes-- ) | ||
| 345 | { | ||
| 346 | if( sIn.read( &i, 1 ) < 1 ) | ||
| 347 | return 0; | ||
| 348 | c |= (i&utf8_lmask[6])<<(6*(iBytes-1)); | ||
| 349 | } | ||
| 350 | return iRead; | ||
| 351 | } | ||
| 352 | else | ||
| 353 | { | ||
| 354 | c = (Bu::UtfChar)i; | ||
| 355 | return 1; | ||
| 356 | } | ||
| 357 | } | ||
| 358 | break; | ||
| 359 | |||
| 360 | case Utf16: | ||
| 361 | case Utf16be: | ||
| 362 | case Utf16le: | ||
| 363 | case Utf32: | ||
| 364 | case Utf32be: | ||
| 365 | case Utf32le: | ||
| 366 | case Ucs2: | ||
| 367 | case Ucs4: | ||
| 368 | case GuessEncoding: | ||
| 369 | throw Bu::ExceptionBase("Not implemented."); | ||
| 370 | break; | ||
| 371 | } | ||
| 372 | return -1; | ||
| 373 | } | ||
| 374 | |||
| 375 | int Bu::UtfString::writePoint( Bu::Stream &sOut, const Bu::UtfChar &c, | ||
| 376 | Bu::UtfString::Encoding sEnc ) | ||
| 377 | { | ||
| 378 | switch( sEnc ) | ||
| 379 | { | ||
| 380 | case Utf8: | ||
| 381 | { | ||
| 382 | uint8_t uByte; | ||
| 383 | if( c >= 0x010000 ) | ||
| 384 | { | ||
| 385 | // Four bytes | ||
| 386 | // 111 111111 111111 111111 | ||
| 387 | uByte = (c>>18)|0xF0; | ||
| 388 | sOut.write( &uByte, 1 ); | ||
| 389 | uByte = ((c>>12)&0x3F)|0x80; | ||
| 390 | sOut.write( &uByte, 1 ); | ||
| 391 | uByte = ((c>>6)&0x3F)|0x80; | ||
| 392 | sOut.write( &uByte, 1 ); | ||
| 393 | uByte = (c&0x3F)|0x80; | ||
| 394 | sOut.write( &uByte, 1 ); | ||
| 395 | return 4; | ||
| 396 | } | ||
| 397 | else if( c >= 0x800 ) | ||
| 398 | { | ||
| 399 | // Three bytes | ||
| 400 | // 1111 111111 111111 | ||
| 401 | uByte = (c>>12)|0xE0; | ||
| 402 | sOut.write( &uByte, 1 ); | ||
| 403 | uByte = ((c>>6)&0x3F)|0x80; | ||
| 404 | sOut.write( &uByte, 1 ); | ||
| 405 | uByte = (c&0x3F)|0x80; | ||
| 406 | sOut.write( &uByte, 1 ); | ||
| 407 | return 3; | ||
| 408 | } | ||
| 409 | else if( c >= 0x80 ) | ||
| 410 | { | ||
| 411 | // Two bytes | ||
| 412 | // 11111 111111 | ||
| 413 | uByte = (c>>6)|0xC0; | ||
| 414 | sOut.write( &uByte, 1 ); | ||
| 415 | uByte = (c&0x3F)|0x80; | ||
| 416 | sOut.write( &uByte, 1 ); | ||
| 417 | return 2; | ||
| 418 | } | ||
| 419 | else | ||
| 420 | { | ||
| 421 | // One byte | ||
| 422 | uByte = c; | ||
| 423 | sOut.write( &uByte, 1 ); | ||
| 424 | return 1; | ||
| 425 | } | ||
| 426 | } | ||
| 427 | break; | ||
| 428 | |||
| 429 | case Utf16: | ||
| 430 | case Utf16be: | ||
| 431 | case Utf16le: | ||
| 432 | case Utf32: | ||
| 433 | case Utf32be: | ||
| 434 | case Utf32le: | ||
| 435 | case Ucs2: | ||
| 436 | case Ucs4: | ||
| 437 | case GuessEncoding: | ||
| 438 | throw Bu::ExceptionBase("Not implemented."); | ||
| 439 | break; | ||
| 440 | } | ||
| 441 | return -1; | ||
| 442 | } | ||
| 443 | |||
| 444 | int32_t Bu::UtfString::toInt32( int iRadix ) const | ||
| 445 | { | ||
| 446 | return strtol( get().getStr(), NULL, iRadix ); | ||
| 447 | } | ||
| 448 | |||
| 449 | int64_t Bu::UtfString::toInt64( int iRadix ) const | ||
| 450 | { | ||
| 451 | return strtoll( get().getStr(), NULL, iRadix ); | ||
| 452 | } | ||
| 453 | |||
| 324 | void Bu::UtfString::writeUtf8( Bu::Stream &sOut ) const | 454 | void Bu::UtfString::writeUtf8( Bu::Stream &sOut ) const |
| 325 | { | 455 | { |
| 326 | int iPos = 0; | 456 | int iPos = 0; |
| @@ -496,6 +626,33 @@ bool Bu::UtfString::operator==( const Bu::UtfString &rhs ) const | |||
| 496 | return aData == rhs.aData; | 626 | return aData == rhs.aData; |
| 497 | } | 627 | } |
| 498 | 628 | ||
| 629 | bool Bu::UtfString::operator==( const Bu::String &rhs ) const | ||
| 630 | { | ||
| 631 | // Naive comparison | ||
| 632 | if( aData.getSize() != rhs.getSize() ) | ||
| 633 | return false; | ||
| 634 | |||
| 635 | for( int j = 0; j < aData.getSize(); j++ ) | ||
| 636 | { | ||
| 637 | if( aData[j] != rhs[j] ) | ||
| 638 | return false; | ||
| 639 | } | ||
| 640 | |||
| 641 | return true; | ||
| 642 | } | ||
| 643 | |||
| 644 | bool Bu::UtfString::operator==( const char *rhs ) const | ||
| 645 | { | ||
| 646 | // Naive comparison | ||
| 647 | for( int j = 0; j < aData.getSize(); j++ ) | ||
| 648 | { | ||
| 649 | if( rhs[j] == '\0' || aData[j] != rhs[j] ) | ||
| 650 | return false; | ||
| 651 | } | ||
| 652 | |||
| 653 | return true; | ||
| 654 | } | ||
| 655 | |||
| 499 | Bu::UtfString &Bu::UtfString::operator+=( const Bu::UtfString &rhs ) | 656 | Bu::UtfString &Bu::UtfString::operator+=( const Bu::UtfString &rhs ) |
| 500 | { | 657 | { |
| 501 | append( rhs ); | 658 | append( rhs ); |
| @@ -508,6 +665,56 @@ Bu::UtfString &Bu::UtfString::operator+=( const UtfChar &rhs ) | |||
| 508 | return *this; | 665 | return *this; |
| 509 | } | 666 | } |
| 510 | 667 | ||
| 668 | bool Bu::UtfString::operator<( const Bu::UtfString &rhs ) const | ||
| 669 | { | ||
| 670 | for( int j = 0; j < aData.getSize() && j < rhs.aData.getSize(); j++ ) | ||
| 671 | { | ||
| 672 | if( aData[j] != rhs.aData[j] ) | ||
| 673 | return aData[j] < rhs.aData[j]; | ||
| 674 | } | ||
| 675 | |||
| 676 | return false; | ||
| 677 | } | ||
| 678 | |||
| 679 | bool Bu::UtfString::operator<=( const Bu::UtfString &rhs ) const | ||
| 680 | { | ||
| 681 | for( int j = 0; j < aData.getSize() && j < rhs.aData.getSize(); j++ ) | ||
| 682 | { | ||
| 683 | if( aData[j] != rhs.aData[j] ) | ||
| 684 | return aData[j] < rhs.aData[j]; | ||
| 685 | } | ||
| 686 | |||
| 687 | if( aData.getSize() == rhs.aData.getSize() ) | ||
| 688 | return true; | ||
| 689 | |||
| 690 | return false; | ||
| 691 | } | ||
| 692 | |||
| 693 | bool Bu::UtfString::operator>( const Bu::UtfString &rhs ) const | ||
| 694 | { | ||
| 695 | for( int j = 0; j < aData.getSize() && j < rhs.aData.getSize(); j++ ) | ||
| 696 | { | ||
| 697 | if( aData[j] != rhs.aData[j] ) | ||
| 698 | return aData[j] > rhs.aData[j]; | ||
| 699 | } | ||
| 700 | |||
| 701 | return false; | ||
| 702 | } | ||
| 703 | |||
| 704 | bool Bu::UtfString::operator>=( const Bu::UtfString &rhs ) const | ||
| 705 | { | ||
| 706 | for( int j = 0; j < aData.getSize() && j < rhs.aData.getSize(); j++ ) | ||
| 707 | { | ||
| 708 | if( aData[j] != rhs.aData[j] ) | ||
| 709 | return aData[j] > rhs.aData[j]; | ||
| 710 | } | ||
| 711 | |||
| 712 | if( aData.getSize() == rhs.aData.getSize() ) | ||
| 713 | return true; | ||
| 714 | |||
| 715 | return false; | ||
| 716 | } | ||
| 717 | |||
| 511 | Bu::String Bu::UtfString::get( Encoding eEnc ) const | 718 | Bu::String Bu::UtfString::get( Encoding eEnc ) const |
| 512 | { | 719 | { |
| 513 | Bu::MemBuf mb; | 720 | Bu::MemBuf mb; |
| @@ -537,16 +744,6 @@ void Bu::UtfString::debug() const | |||
| 537 | /* | 744 | /* |
| 538 | void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 ) | 745 | void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 ) |
| 539 | { | 746 | { |
| 540 | static uint8_t lmask[8] = { | ||
| 541 | 0x00, | ||
| 542 | 0x01, | ||
| 543 | 0x03, | ||
| 544 | 0x07, | ||
| 545 | 0x0f, | ||
| 546 | 0x1f, | ||
| 547 | 0x3f, | ||
| 548 | 0x7f | ||
| 549 | }; | ||
| 550 | for( Bu::String::const_iterator i = sUtf8.begin(); i; i++ ) | 747 | for( Bu::String::const_iterator i = sUtf8.begin(); i; i++ ) |
| 551 | { | 748 | { |
| 552 | if( i != sUtf8.begin() ) | 749 | if( i != sUtf8.begin() ) |
| @@ -558,9 +755,9 @@ void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 ) | |||
| 558 | int iBytes = 1; | 755 | int iBytes = 1; |
| 559 | for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { } | 756 | for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { } |
| 560 | // sio << "iBytes = " << iBytes << sio.nl; | 757 | // sio << "iBytes = " << iBytes << sio.nl; |
| 561 | Bu::UtfChar uPt = ((*i) & lmask[7-iBytes])<<(6*(iBytes-1)); | 758 | Bu::UtfChar uPt = ((*i) & utf8_lmask[7-iBytes])<<(6*(iBytes-1)); |
| 562 | // sio << "mask: " << Bu::Fmt().radix(2).width(8).fill('0') | 759 | // sio << "mask: " << Bu::Fmt().radix(2).width(8).fill('0') |
| 563 | // << (int)lmask[7-iBytes] << sio.nl; | 760 | // << (int)utf8_lmask[7-iBytes] << sio.nl; |
| 564 | for( iBytes--; iBytes >= 1; iBytes-- ) | 761 | for( iBytes--; iBytes >= 1; iBytes-- ) |
| 565 | { | 762 | { |
| 566 | // sio << "iBytes = " << iBytes << ", shift = " << (6*(iBytes-1)) | 763 | // sio << "iBytes = " << iBytes << ", shift = " << (6*(iBytes-1)) |
| @@ -568,9 +765,9 @@ void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 ) | |||
| 568 | // sio << "next: " << Bu::Fmt().radix(2).width(8).fill('0') | 765 | // sio << "next: " << Bu::Fmt().radix(2).width(8).fill('0') |
| 569 | // << (int)(uint8_t)*i << sio.nl | 766 | // << (int)(uint8_t)*i << sio.nl |
| 570 | // << "mask: " << Bu::Fmt().radix(2).width(8).fill('0') | 767 | // << "mask: " << Bu::Fmt().radix(2).width(8).fill('0') |
| 571 | // << (int)lmask[6] << sio.nl; | 768 | // << (int)utf8_lmask[6] << sio.nl; |
| 572 | i++; | 769 | i++; |
| 573 | uPt |= ((*i)&lmask[6])<<(6*(iBytes-1)); | 770 | uPt |= ((*i)&utf8_lmask[6])<<(6*(iBytes-1)); |
| 574 | } | 771 | } |
| 575 | sio << uPt; | 772 | sio << uPt; |
| 576 | // sio << " (" << Bu::Fmt( 8, 2 ).fill('0') | 773 | // sio << " (" << Bu::Fmt( 8, 2 ).fill('0') |
| @@ -602,3 +799,9 @@ template<> bool Bu::__cmpHashKeys<Bu::UtfString>( | |||
| 602 | { | 799 | { |
| 603 | return a == b; | 800 | return a == b; |
| 604 | } | 801 | } |
| 802 | |||
| 803 | Bu::Formatter Bu::operator<<( Bu::Formatter &f, const Bu::UtfString &s ) | ||
| 804 | { | ||
| 805 | return f << s.get(); | ||
| 806 | } | ||
| 807 | |||
