summaryrefslogtreecommitdiff
path: root/src/unstable/utfstring.cpp
diff options
context:
space:
mode:
author: Mike Buland <eichlan@xagasoft.com> 2019-05-13 19:47:19 -0700
committer: Mike Buland <eichlan@xagasoft.com> 2019-05-13 19:47:19 -0700
commit: d605d6c3c04c1e26121f9b1c5c1d2dbcc5f7bc37 (patch)
tree: 0cd21d420fc67ae757ec2475610c4624fd714363 /src/unstable/utfstring.cpp
parent: 62753c815b5ec34ebfae37a3c89187a01cc17160 (diff)
downloadlibbu++-d605d6c3c04c1e26121f9b1c5c1d2dbcc5f7bc37.tar.gz
libbu++-d605d6c3c04c1e26121f9b1c5c1d2dbcc5f7bc37.tar.bz2
libbu++-d605d6c3c04c1e26121f9b1c5c1d2dbcc5f7bc37.tar.xz
libbu++-d605d6c3c04c1e26121f9b1c5c1d2dbcc5f7bc37.zip
UtfString & Json overhaul.
UtfString supports a load of new stuff, and Json uses UtfString exclusively now.
Diffstat (limited to 'src/unstable/utfstring.cpp')
-rw-r--r-- src/unstable/utfstring.cpp 255
1 files changed, 229 insertions, 26 deletions
diff --git a/src/unstable/utfstring.cpp b/src/unstable/utfstring.cpp
index f945725..46c78e6 100644
--- a/src/unstable/utfstring.cpp
+++ b/src/unstable/utfstring.cpp
@@ -12,8 +12,21 @@
12#include "bu/config.h" 12#include "bu/config.h"
13#include "bu/sio.h" 13#include "bu/sio.h"
14#include "bu/membuf.h" 14#include "bu/membuf.h"
15#include "bu/formatter.h"
16
15using Bu::sio; 17using Bu::sio;
16 18
19uint8_t Bu::UtfString::utf8_lmask[8] = {
20 0x00,
21 0x01,
22 0x03,
23 0x07,
24 0x0f,
25 0x1f,
26 0x3f,
27 0x7f
28};
29
17Bu::UtfString::UtfString() 30Bu::UtfString::UtfString()
18{ 31{
19} 32}
@@ -111,27 +124,17 @@ void Bu::UtfString::append( const UtfString &rSrc )
111 124
112void Bu::UtfString::setUtf8( const Bu::String &sInput ) 125void Bu::UtfString::setUtf8( const Bu::String &sInput )
113{ 126{
114 static uint8_t lmask[8] = {
115 0x00,
116 0x01,
117 0x03,
118 0x07,
119 0x0f,
120 0x1f,
121 0x3f,
122 0x7f
123 };
124 for( Bu::String::const_iterator i = sInput.begin(); i; i++ ) 127 for( Bu::String::const_iterator i = sInput.begin(); i; i++ )
125 { 128 {
126 if( ((int)(uint8_t)*i)&0x80 ) 129 if( ((int)(uint8_t)*i)&0x80 )
127 { 130 {
128 int iBytes = 1; 131 int iBytes = 1;
129 for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { } 132 for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { }
130 Bu::UtfChar uPt = ((*i) & lmask[7-iBytes])<<(6*(iBytes-1)); 133 Bu::UtfChar uPt = ((*i) & utf8_lmask[7-iBytes])<<(6*(iBytes-1));
131 for( iBytes--; iBytes >= 1; iBytes-- ) 134 for( iBytes--; iBytes >= 1; iBytes-- )
132 { 135 {
133 i++; 136 i++;
134 uPt |= ((*i)&lmask[6])<<(6*(iBytes-1)); 137 uPt |= ((*i)&utf8_lmask[6])<<(6*(iBytes-1));
135 } 138 }
136 append( uPt ); 139 append( uPt );
137 } 140 }
@@ -321,6 +324,133 @@ void Bu::UtfString::write( Bu::Stream &sOut, Encoding eEnc ) const
321 } 324 }
322} 325}
323 326
327int Bu::UtfString::readPoint( Bu::Stream &sIn, Bu::UtfChar &c,
328 Bu::UtfString::Encoding sEnc )
329{
330 switch( sEnc )
331 {
332 case Utf8:
333 {
334 uint8_t i;
335 int iRead = 1;
336 if( sIn.read( &i, 1 ) < 1 )
337 return 0;
338 if( ((int)i)&0x80 )
339 {
340 int iBytes = 1;
341 for(; (((uint8_t)i)<<iBytes)&0x80; iBytes++ ) { }
342 iRead = iBytes;
343 c = (i & utf8_lmask[7-iBytes])<<(6*(iBytes-1));
344 for( iBytes--; iBytes >= 1; iBytes-- )
345 {
346 if( sIn.read( &i, 1 ) < 1 )
347 return 0;
348 c |= (i&utf8_lmask[6])<<(6*(iBytes-1));
349 }
350 return iRead;
351 }
352 else
353 {
354 c = (Bu::UtfChar)i;
355 return 1;
356 }
357 }
358 break;
359
360 case Utf16:
361 case Utf16be:
362 case Utf16le:
363 case Utf32:
364 case Utf32be:
365 case Utf32le:
366 case Ucs2:
367 case Ucs4:
368 case GuessEncoding:
369 throw Bu::ExceptionBase("Not implemented.");
370 break;
371 }
372 return -1;
373}
374
375int Bu::UtfString::writePoint( Bu::Stream &sOut, const Bu::UtfChar &c,
376 Bu::UtfString::Encoding sEnc )
377{
378 switch( sEnc )
379 {
380 case Utf8:
381 {
382 uint8_t uByte;
383 if( c >= 0x010000 )
384 {
385 // Four bytes
386 // 111 111111 111111 111111
387 uByte = (c>>18)|0xF0;
388 sOut.write( &uByte, 1 );
389 uByte = ((c>>12)&0x3F)|0x80;
390 sOut.write( &uByte, 1 );
391 uByte = ((c>>6)&0x3F)|0x80;
392 sOut.write( &uByte, 1 );
393 uByte = (c&0x3F)|0x80;
394 sOut.write( &uByte, 1 );
395 return 4;
396 }
397 else if( c >= 0x800 )
398 {
399 // Three bytes
400 // 1111 111111 111111
401 uByte = (c>>12)|0xE0;
402 sOut.write( &uByte, 1 );
403 uByte = ((c>>6)&0x3F)|0x80;
404 sOut.write( &uByte, 1 );
405 uByte = (c&0x3F)|0x80;
406 sOut.write( &uByte, 1 );
407 return 3;
408 }
409 else if( c >= 0x80 )
410 {
411 // Two bytes
412 // 11111 111111
413 uByte = (c>>6)|0xC0;
414 sOut.write( &uByte, 1 );
415 uByte = (c&0x3F)|0x80;
416 sOut.write( &uByte, 1 );
417 return 2;
418 }
419 else
420 {
421 // One byte
422 uByte = c;
423 sOut.write( &uByte, 1 );
424 return 1;
425 }
426 }
427 break;
428
429 case Utf16:
430 case Utf16be:
431 case Utf16le:
432 case Utf32:
433 case Utf32be:
434 case Utf32le:
435 case Ucs2:
436 case Ucs4:
437 case GuessEncoding:
438 throw Bu::ExceptionBase("Not implemented.");
439 break;
440 }
441 return -1;
442}
443
444int32_t Bu::UtfString::toInt32( int iRadix ) const
445{
446 return strtol( get().getStr(), NULL, iRadix );
447}
448
449int64_t Bu::UtfString::toInt64( int iRadix ) const
450{
451 return strtoll( get().getStr(), NULL, iRadix );
452}
453
324void Bu::UtfString::writeUtf8( Bu::Stream &sOut ) const 454void Bu::UtfString::writeUtf8( Bu::Stream &sOut ) const
325{ 455{
326 int iPos = 0; 456 int iPos = 0;
@@ -496,6 +626,33 @@ bool Bu::UtfString::operator==( const Bu::UtfString &rhs ) const
496 return aData == rhs.aData; 626 return aData == rhs.aData;
497} 627}
498 628
629bool Bu::UtfString::operator==( const Bu::String &rhs ) const
630{
631 // Nieve comparison
632 if( aData.getSize() != rhs.getSize() )
633 return false;
634
635 for( int j = 0; j < aData.getSize(); j++ )
636 {
637 if( aData[j] != rhs[j] )
638 return false;
639 }
640
641 return true;
642}
643
644bool Bu::UtfString::operator==( const char *rhs ) const
645{
646 // Nieve comparison
647 for( int j = 0; j < aData.getSize(); j++ )
648 {
649 if( rhs[j] == '\0' || aData[j] != rhs[j] )
650 return false;
651 }
652
653 return true;
654}
655
499Bu::UtfString &Bu::UtfString::operator+=( const Bu::UtfString &rhs ) 656Bu::UtfString &Bu::UtfString::operator+=( const Bu::UtfString &rhs )
500{ 657{
501 append( rhs ); 658 append( rhs );
@@ -508,6 +665,56 @@ Bu::UtfString &Bu::UtfString::operator+=( const UtfChar &rhs )
508 return *this; 665 return *this;
509} 666}
510 667
668bool Bu::UtfString::operator<( const Bu::UtfString &rhs ) const
669{
670 for( int j = 0; j < aData.getSize() && j < rhs.aData.getSize(); j++ )
671 {
672 if( aData[j] != rhs.aData[j] )
673 return aData[j] < rhs.aData[j];
674 }
675
676 return false;
677}
678
679bool Bu::UtfString::operator<=( const Bu::UtfString &rhs ) const
680{
681 for( int j = 0; j < aData.getSize() && j < rhs.aData.getSize(); j++ )
682 {
683 if( aData[j] != rhs.aData[j] )
684 return aData[j] < rhs.aData[j];
685 }
686
687 if( aData.getSize() == rhs.aData.getSize() )
688 return true;
689
690 return false;
691}
692
693bool Bu::UtfString::operator>( const Bu::UtfString &rhs ) const
694{
695 for( int j = 0; j < aData.getSize() && j < rhs.aData.getSize(); j++ )
696 {
697 if( aData[j] != rhs.aData[j] )
698 return aData[j] > rhs.aData[j];
699 }
700
701 return false;
702}
703
704bool Bu::UtfString::operator>=( const Bu::UtfString &rhs ) const
705{
706 for( int j = 0; j < aData.getSize() && j < rhs.aData.getSize(); j++ )
707 {
708 if( aData[j] != rhs.aData[j] )
709 return aData[j] > rhs.aData[j];
710 }
711
712 if( aData.getSize() == rhs.aData.getSize() )
713 return true;
714
715 return false;
716}
717
511Bu::String Bu::UtfString::get( Encoding eEnc ) const 718Bu::String Bu::UtfString::get( Encoding eEnc ) const
512{ 719{
513 Bu::MemBuf mb; 720 Bu::MemBuf mb;
@@ -537,16 +744,6 @@ void Bu::UtfString::debug() const
537/* 744/*
538void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 ) 745void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 )
539{ 746{
540 static uint8_t lmask[8] = {
541 0x00,
542 0x01,
543 0x03,
544 0x07,
545 0x0f,
546 0x1f,
547 0x3f,
548 0x7f
549 };
550 for( Bu::String::const_iterator i = sUtf8.begin(); i; i++ ) 747 for( Bu::String::const_iterator i = sUtf8.begin(); i; i++ )
551 { 748 {
552 if( i != sUtf8.begin() ) 749 if( i != sUtf8.begin() )
@@ -558,9 +755,9 @@ void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 )
558 int iBytes = 1; 755 int iBytes = 1;
559 for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { } 756 for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { }
560// sio << "iBytes = " << iBytes << sio.nl; 757// sio << "iBytes = " << iBytes << sio.nl;
561 Bu::UtfChar uPt = ((*i) & lmask[7-iBytes])<<(6*(iBytes-1)); 758 Bu::UtfChar uPt = ((*i) & utf8_lmask[7-iBytes])<<(6*(iBytes-1));
562// sio << "mask: " << Bu::Fmt().radix(2).width(8).fill('0') 759// sio << "mask: " << Bu::Fmt().radix(2).width(8).fill('0')
563// << (int)lmask[7-iBytes] << sio.nl; 760// << (int)utf8_lmask[7-iBytes] << sio.nl;
564 for( iBytes--; iBytes >= 1; iBytes-- ) 761 for( iBytes--; iBytes >= 1; iBytes-- )
565 { 762 {
566// sio << "iBytes = " << iBytes << ", shift = " << (6*(iBytes-1)) 763// sio << "iBytes = " << iBytes << ", shift = " << (6*(iBytes-1))
@@ -568,9 +765,9 @@ void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 )
568// sio << "next: " << Bu::Fmt().radix(2).width(8).fill('0') 765// sio << "next: " << Bu::Fmt().radix(2).width(8).fill('0')
569// << (int)(uint8_t)*i << sio.nl 766// << (int)(uint8_t)*i << sio.nl
570// << "mask: " << Bu::Fmt().radix(2).width(8).fill('0') 767// << "mask: " << Bu::Fmt().radix(2).width(8).fill('0')
571// << (int)lmask[6] << sio.nl; 768// << (int)utf8_lmask[6] << sio.nl;
572 i++; 769 i++;
573 uPt |= ((*i)&lmask[6])<<(6*(iBytes-1)); 770 uPt |= ((*i)&utf8_lmask[6])<<(6*(iBytes-1));
574 } 771 }
575 sio << uPt; 772 sio << uPt;
576// sio << " (" << Bu::Fmt( 8, 2 ).fill('0') 773// sio << " (" << Bu::Fmt( 8, 2 ).fill('0')
@@ -602,3 +799,9 @@ template<> bool Bu::__cmpHashKeys<Bu::UtfString>(
602{ 799{
603 return a == b; 800 return a == b;
604} 801}
802
803Bu::Formatter Bu::operator<<( Bu::Formatter &f, const Bu::UtfString &s )
804{
805 return f << s.get();
806}
807