From d605d6c3c04c1e26121f9b1c5c1d2dbcc5f7bc37 Mon Sep 17 00:00:00 2001 From: Mike Buland Date: Mon, 13 May 2019 19:47:19 -0700 Subject: UtfString & Json overhaul. UtfString supports a load of new stuff, and Json uses UtfString exclusively now. --- src/unstable/utfstring.cpp | 255 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 229 insertions(+), 26 deletions(-) (limited to 'src/unstable/utfstring.cpp') diff --git a/src/unstable/utfstring.cpp b/src/unstable/utfstring.cpp index f945725..46c78e6 100644 --- a/src/unstable/utfstring.cpp +++ b/src/unstable/utfstring.cpp @@ -12,8 +12,21 @@ #include "bu/config.h" #include "bu/sio.h" #include "bu/membuf.h" +#include "bu/formatter.h" + using Bu::sio; +uint8_t Bu::UtfString::utf8_lmask[8] = { + 0x00, + 0x01, + 0x03, + 0x07, + 0x0f, + 0x1f, + 0x3f, + 0x7f +}; + Bu::UtfString::UtfString() { } @@ -111,27 +124,17 @@ void Bu::UtfString::append( const UtfString &rSrc ) void Bu::UtfString::setUtf8( const Bu::String &sInput ) { - static uint8_t lmask[8] = { - 0x00, - 0x01, - 0x03, - 0x07, - 0x0f, - 0x1f, - 0x3f, - 0x7f - }; for( Bu::String::const_iterator i = sInput.begin(); i; i++ ) { if( ((int)(uint8_t)*i)&0x80 ) { int iBytes = 1; for(; (((uint8_t)(*i))<= 1; iBytes-- ) { i++; - uPt |= ((*i)&lmask[6])<<(6*(iBytes-1)); + uPt |= ((*i)&utf8_lmask[6])<<(6*(iBytes-1)); } append( uPt ); } @@ -321,6 +324,133 @@ void Bu::UtfString::write( Bu::Stream &sOut, Encoding eEnc ) const } } +int Bu::UtfString::readPoint( Bu::Stream &sIn, Bu::UtfChar &c, + Bu::UtfString::Encoding sEnc ) +{ + switch( sEnc ) + { + case Utf8: + { + uint8_t i; + int iRead = 1; + if( sIn.read( &i, 1 ) < 1 ) + return 0; + if( ((int)i)&0x80 ) + { + int iBytes = 1; + for(; (((uint8_t)i)<= 1; iBytes-- ) + { + if( sIn.read( &i, 1 ) < 1 ) + return 0; + c |= (i&utf8_lmask[6])<<(6*(iBytes-1)); + } + return iRead; + } + else + { + c = (Bu::UtfChar)i; + return 1; + } + } + break; + + case Utf16: + case Utf16be: + case Utf16le: + case Utf32: + case Utf32be: + case Utf32le: + case Ucs2: + case Ucs4: + case GuessEncoding: + throw Bu::ExceptionBase("Not implemented."); + break; + } + return -1; +} + +int Bu::UtfString::writePoint( Bu::Stream &sOut, const Bu::UtfChar &c, + Bu::UtfString::Encoding sEnc ) +{ + switch( sEnc ) + { + case Utf8: + { + uint8_t uByte; + if( c >= 0x010000 ) + { + // Four bytes + // 111 111111 111111 111111 + uByte = (c>>18)|0xF0; + sOut.write( &uByte, 1 ); + uByte = ((c>>12)&0x3F)|0x80; + sOut.write( &uByte, 1 ); + uByte = ((c>>6)&0x3F)|0x80; + sOut.write( &uByte, 1 ); + uByte = (c&0x3F)|0x80; + sOut.write( &uByte, 1 ); + return 4; + } + else if( c >= 0x800 ) + { + // Three bytes + // 1111 111111 111111 + uByte = (c>>12)|0xE0; + sOut.write( &uByte, 1 ); + uByte = ((c>>6)&0x3F)|0x80; + sOut.write( &uByte, 1 ); + uByte = (c&0x3F)|0x80; + sOut.write( &uByte, 1 ); + return 3; + } + else if( c >= 0x80 ) + { + // Two bytes + // 11111 111111 + uByte = (c>>6)|0xC0; + sOut.write( &uByte, 1 ); + uByte = (c&0x3F)|0x80; + sOut.write( &uByte, 1 ); + return 2; + } + else + { + // One byte + uByte = c; + sOut.write( &uByte, 1 ); + return 1; + } + } + break; + + case Utf16: + case Utf16be: + case Utf16le: + case Utf32: + case Utf32be: + case Utf32le: + case Ucs2: + case Ucs4: + case GuessEncoding: + throw Bu::ExceptionBase("Not implemented."); + break; + } + return -1; +} + +int32_t Bu::UtfString::toInt32( int iRadix ) const +{ + return strtol( get().getStr(), NULL, iRadix ); +} + +int64_t Bu::UtfString::toInt64( int iRadix ) const +{ + return strtoll( get().getStr(), NULL, iRadix ); +} + void Bu::UtfString::writeUtf8( Bu::Stream &sOut ) const { int iPos = 0; @@ -496,6 +626,33 @@ bool Bu::UtfString::operator==( const Bu::UtfString &rhs ) const return aData == rhs.aData; } +bool Bu::UtfString::operator==( const Bu::String &rhs ) const +{ + // Nieve comparison + if( aData.getSize() != rhs.getSize() ) + return false; + + for( int j = 0; j < aData.getSize(); j++ ) + { + if( aData[j] != rhs[j] ) + return false; + } + + return true; +} + +bool Bu::UtfString::operator==( const char *rhs ) const +{ + // Nieve comparison + for( int j = 0; j < aData.getSize(); j++ ) + { + if( rhs[j] == '\0' || aData[j] != rhs[j] ) + return false; + } + + return true; +} + Bu::UtfString &Bu::UtfString::operator+=( const Bu::UtfString &rhs ) { append( rhs ); @@ -508,6 +665,56 @@ Bu::UtfString &Bu::UtfString::operator+=( const UtfChar &rhs ) return *this; } +bool Bu::UtfString::operator<( const Bu::UtfString &rhs ) const +{ + for( int j = 0; j < aData.getSize() && j < rhs.aData.getSize(); j++ ) + { + if( aData[j] != rhs.aData[j] ) + return aData[j] < rhs.aData[j]; + } + + return false; +} + +bool Bu::UtfString::operator<=( const Bu::UtfString &rhs ) const +{ + for( int j = 0; j < aData.getSize() && j < rhs.aData.getSize(); j++ ) + { + if( aData[j] != rhs.aData[j] ) + return aData[j] < rhs.aData[j]; + } + + if( aData.getSize() == rhs.aData.getSize() ) + return true; + + return false; +} + +bool Bu::UtfString::operator>( const Bu::UtfString &rhs ) const +{ + for( int j = 0; j < aData.getSize() && j < rhs.aData.getSize(); j++ ) + { + if( aData[j] != rhs.aData[j] ) + return aData[j] > rhs.aData[j]; + } + + return false; +} + +bool Bu::UtfString::operator>=( const Bu::UtfString &rhs ) const +{ + for( int j = 0; j < aData.getSize() && j < rhs.aData.getSize(); j++ ) + { + if( aData[j] != rhs.aData[j] ) + return aData[j] > rhs.aData[j]; + } + + if( aData.getSize() == rhs.aData.getSize() ) + return true; + + return false; +} + Bu::String Bu::UtfString::get( Encoding eEnc ) const { Bu::MemBuf mb; @@ -537,16 +744,6 @@ void Bu::UtfString::debug() const /* void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 ) { - static uint8_t lmask[8] = { - 0x00, - 0x01, - 0x03, - 0x07, - 0x0f, - 0x1f, - 0x3f, - 0x7f - }; for( Bu::String::const_iterator i = sUtf8.begin(); i; i++ ) { if( i != sUtf8.begin() ) @@ -558,9 +755,9 @@ void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 ) int iBytes = 1; for(; (((uint8_t)(*i))<= 1; iBytes-- ) { // sio << "iBytes = " << iBytes << ", shift = " << (6*(iBytes-1)) @@ -568,9 +765,9 @@ void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 ) // sio << "next: " << Bu::Fmt().radix(2).width(8).fill('0') // << (int)(uint8_t)*i << sio.nl // << "mask: " << Bu::Fmt().radix(2).width(8).fill('0') -// << (int)lmask[6] << sio.nl; +// << (int)utf8_lmask[6] << sio.nl; i++; - uPt |= ((*i)&lmask[6])<<(6*(iBytes-1)); + uPt |= ((*i)&utf8_lmask[6])<<(6*(iBytes-1)); } sio << uPt; // sio << " (" << Bu::Fmt( 8, 2 ).fill('0') @@ -602,3 +799,9 @@ template<> bool Bu::__cmpHashKeys( { return a == b; } + +Bu::Formatter Bu::operator<<( Bu::Formatter &f, const Bu::UtfString &s ) +{ + return f << s.get(); +} + -- cgit v1.2.3