From 6ff23ac5097f7d92ac8840c2ba17d1dbf1eb80a5 Mon Sep 17 00:00:00 2001 From: Mike Buland Date: Mon, 4 Apr 2011 14:59:13 +0000 Subject: UtfString is going really well. It can now parse Utf8, Utf16 (le,be), and Utf32 (le,be). The internal storage seems to be working fine, although we do have a problem with random access, but at least we can tell which half of a surrogate pair we're on, so we can always rapidly determine the entire code point from any utf16 index that we're on. The only optimization that I'm not doing yet is reading in entire 16bit or 32bit words at a time and converting them from their byte order to native. There are a few potential issues with that, so we'll see. I added a couple of testing datafiles and a test program, I'll delete them all just as soon as it's verified to write correctly. --- src/utfstring.cpp | 240 ++++++++++++++++++++++++++++++++++++++++++++++++++---- src/utfstring.h | 52 +++++++++++- test.utf16 | 1 + test.utf16be | 1 + test.utf16le | 1 + utf16.cpp | 42 ++++++++++ 6 files changed, 319 insertions(+), 18 deletions(-) create mode 100644 test.utf16 create mode 100644 test.utf16be create mode 100644 test.utf16le create mode 100644 utf16.cpp diff --git a/src/utfstring.cpp b/src/utfstring.cpp index bb0a011..7c4ba19 100644 --- a/src/utfstring.cpp +++ b/src/utfstring.cpp @@ -8,9 +8,13 @@ #include "bu/utfstring.h" #include "bu/string.h" +#include "bu/stream.h" #include +#include "bu/sio.h" +using Bu::sio; + Bu::UtfString::UtfString() { } @@ -33,20 +37,35 @@ void Bu::UtfString::set( const Bu::String &sInput, Encoding eEnc ) break; case Utf16: - case Utf16be: setUtf16( sInput ); break; + case Utf16be: + setUtf16be( sInput ); + break; + case Utf16le: - throw Bu::ExceptionBase("Utf16le not supported yet."); + setUtf16le( sInput ); break; case Utf32: - throw Bu::ExceptionBase("Utf32 not supported yet."); + setUtf32( sInput ); + break; + + case Utf32be: + setUtf32be( sInput ); + break; + + case Utf32le: + setUtf32le( sInput ); + break; + + case 
Ucs2: + throw Bu::ExceptionBase("Ucs2 not supported yet."); break; - case Ucs16: - throw Bu::ExceptionBase("Ucs16 not supported yet."); + case Ucs4: + throw Bu::ExceptionBase("Ucs4 not supported yet."); break; case GuessEncoding: @@ -104,8 +123,32 @@ void Bu::UtfString::setUtf8( const Bu::String &sInput ) void Bu::UtfString::setUtf16( const Bu::String &sInput ) { + Bu::String::const_iterator i = sInput.begin(); + if( (uint8_t)*sInput.begin() == 0xFF && + (uint8_t)*(sInput.begin()+1) == 0xFE ) + { + setUtf16le( sInput ); + return; + } + setUtf16be( sInput ); +} + +void Bu::UtfString::setUtf16be( const Bu::String &sInput ) +{ + Bu::String::const_iterator i = sInput.begin(); + if( (uint8_t)*sInput.begin() == 0xFE && + (uint8_t)*(sInput.begin()+1) == 0xFF ) + + { + i += 2; + sio << "Verified big endian." << sio.nl; + } + else + { + sio << "Assuming big endian." << sio.nl; + } uint16_t hi, lo; - for( Bu::String::const_iterator i = sInput.begin(); i; i++ ) + for( ; i; i++ ) { hi = (((uint8_t)*i)<<8) | ((uint8_t)*(++i)); append16( hi ); @@ -117,25 +160,192 @@ void Bu::UtfString::setUtf16( const Bu::String &sInput ) } } -#include "bu/sio.h" -using Bu::sio; +void Bu::UtfString::setUtf16le( const Bu::String &sInput ) +{ + Bu::String::const_iterator i = sInput.begin(); + if( (uint8_t)*sInput.begin() == 0xFF && + (uint8_t)*(sInput.begin()+1) == 0xFE ) + { + i += 2; + sio << "Verified little endian." << sio.nl; + } + else + { + sio << "Assuming little endian." 
<< sio.nl; + } + uint16_t hi, lo; + for( ; i; i++ ) + { + hi = (((uint8_t)*i)) | ((uint8_t)*(++i)<<8); + append16( hi ); + if( (hi&0xD800u) == 0xD800u ) + { + lo = (((uint8_t)*(++i))) | ((uint8_t)*(++i)<<8); + append16( lo ); + } + } +} + +void Bu::UtfString::setUtf32( const Bu::String &sInput ) +{ + Bu::String::const_iterator i = sInput.begin(); + if( (uint8_t)*i == 0x00 && + (uint8_t)*(++i) == 0x00 && + (uint8_t)*(++i) == 0xFF && + (uint8_t)*(++i) == 0xFE ) + { + setUtf32le( sInput ); + return; + } + setUtf32be( sInput ); +} + +void Bu::UtfString::setUtf32be( const Bu::String &sInput ) +{ + Bu::String::const_iterator i = sInput.begin(); + if( (uint8_t)*i == 0x00 && + (uint8_t)*(++i) == 0x00 && + (uint8_t)*(++i) == 0xFE && + (uint8_t)*(++i) == 0xFF ) + { + i++; + sio << "Verified big endian." << sio.nl; + } + else + { + i = sInput.begin(); + sio << "Assuming big endian." << sio.nl; + } + for( ; i; i++ ) + { + append( (((uint8_t)*i)<<24) | + (((uint8_t)*(++i))<<16) | + (((uint8_t)*(++i))<<8) | + ((uint8_t)*(++i)) + ); + } +} + +void Bu::UtfString::setUtf32le( const Bu::String &sInput ) +{ + Bu::String::const_iterator i = sInput.begin(); + if( (uint8_t)*i == 0x00 && + (uint8_t)*(++i) == 0x00 && + (uint8_t)*(++i) == 0xFF && + (uint8_t)*(++i) == 0xFE ) + { + i++; + sio << "Verified little endian." << sio.nl; + } + else + { + i = sInput.begin(); + sio << "Assuming little endian." 
<< sio.nl; + } + for( ; i; i++ ) + { + append( ((uint8_t)*i) | + (((uint8_t)*(++i))<<8) | + (((uint8_t)*(++i))<<16) | + (((uint8_t)*(++i))<<24) + ); + } +} + +void Bu::UtfString::write( Bu::Stream &sOut, Encoding eEnc ) +{ + switch( eEnc ) + { + case Utf8: + writeUtf8( sOut ); + break; + + case Utf16: + writeUtf16( sOut ); + break; + + case Utf16be: + writeUtf16be( sOut ); + break; + + case Utf16le: + writeUtf16le( sOut ); + break; + + case Utf32: + writeUtf32( sOut ); + break; + + case Utf32be: + writeUtf32be( sOut ); + break; + + case Utf32le: + writeUtf32le( sOut ); + break; + + case Ucs2: + throw Bu::ExceptionBase("Ucs2 not supported yet."); + break; + + case Ucs4: + throw Bu::ExceptionBase("Ucs4 not supported yet."); + break; + + case GuessEncoding: + throw Bu::ExceptionBase( + "GuessEncoding is incompatible with encoding."); + break; + + } +} + +void Bu::UtfString::writeUtf8( Bu::Stream &sOut ) +{ +} + +void Bu::UtfString::writeUtf16( Bu::Stream &sOut ) +{ +} + +void Bu::UtfString::writeUtf16be( Bu::Stream &sOut ) +{ +} + +void Bu::UtfString::writeUtf16le( Bu::Stream &sOut ) +{ +} + +void Bu::UtfString::writeUtf32( Bu::Stream &sOut ) +{ +} + +void Bu::UtfString::writeUtf32be( Bu::Stream &sOut ) +{ +} + +void Bu::UtfString::writeUtf32le( Bu::Stream &sOut ) +{ +} Bu::UtfChar Bu::UtfString::get( int iIndex ) { - Bu::UtfChar i = aData[iIndex]; + return nextChar( iIndex ); +} + +Bu::UtfChar Bu::UtfString::nextChar( int &iIndex ) +{ + Bu::UtfChar i = aData[iIndex++]; switch( i&0xFC00 ) { case 0xD800: - sio << "(hi) "; - return (((i&0x3FF)<<10) | ((aData[iIndex+1]&0x3FF)))+0x10000; + return (((i&0x3FF)<<10) | ((aData[iIndex++]&0x3FF)))+0x10000; case 0xDC00: - sio << "(lo) "; - return 0; + return (((aData[iIndex-2]&0x3FF)<<10) | ((i&0x3FF)))+0x10000; default: - sio << "(--) "; - return i&0xFC00; + return i; } } diff --git a/src/utfstring.h b/src/utfstring.h index 79ef62e..8448ea4 100644 --- a/src/utfstring.h +++ b/src/utfstring.h @@ -14,6 +14,7 @@ namespace Bu { 
class String; + class Stream; /** * UtfChar isn't actually a character, unicode specifies "code points" not @@ -35,7 +36,10 @@ namespace Bu Utf16be, Utf16le, Utf32, - Ucs16, + Utf32be, + Utf32le, + Ucs2, + Ucs4, GuessEncoding }; @@ -43,17 +47,59 @@ namespace Bu UtfString( const Bu::String &sInput, Encoding eEnc=Utf8 ); virtual ~UtfString(); + class iterator + { + private: + iterator( UtfString *pSrc, int iCodePos ) : + pSrc( pSrc ), iCodePos( iCodePos ) + { + } + + public: + iterator() : + pSrc( NULL ), iCodePos( 0 ) + { + } + + UtfChar operator*() + { + if( !pSrc ) + throw Bu::ExceptionBase("invalid UtfString::iterator dereferenced."); + return pSrc->nextChar( iCodePos ); + } + + private: + UtfString *pSrc; + int iCodePos; + }; + void append( UtfChar ch ); void set( const Bu::String &sInput, Encoding eEnc=Utf8 ); void setUtf8( const Bu::String &sInput ); void setUtf16( const Bu::String &sInput ); -// void setUtf16be( const Bu::String &sInput ); -// void setUtf16le( const Bu::String &sInput ); + void setUtf16be( const Bu::String &sInput ); + void setUtf16le( const Bu::String &sInput ); + void setUtf32( const Bu::String &sInput ); + void setUtf32be( const Bu::String &sInput ); + void setUtf32le( const Bu::String &sInput ); + + void write( Bu::Stream &sOut, Encoding eEnc=Utf8 ); + void writeUtf8( Bu::Stream &sOut ); + void writeUtf16( Bu::Stream &sOut ); + void writeUtf16be( Bu::Stream &sOut ); + void writeUtf16le( Bu::Stream &sOut ); + void writeUtf32( Bu::Stream &sOut ); + void writeUtf32be( Bu::Stream &sOut ); + void writeUtf32le( Bu::Stream &sOut ); + + Bu::String to( Encoding eEnc=Utf8 ); + Bu::String toUtf8(); void debug(); UtfChar get( int iIndex ); + UtfChar nextChar( int &iIndex ); private: void append16( uint16_t i ) { aData.append( i ); } diff --git a/test.utf16 b/test.utf16 new file mode 100644 index 0000000..86a63c3 --- /dev/null +++ b/test.utf16 @@ -0,0 +1 @@ +¥Ëæ)˜Ø=Þ<Ûÿßý$H \ No newline at end of file diff --git a/test.utf16be b/test.utf16be new file 
mode 100644 index 0000000..136ad1a --- /dev/null +++ b/test.utf16be @@ -0,0 +1 @@ +þÿ¥Ëæ)˜Ø=Þ<Ûÿßý$H \ No newline at end of file diff --git a/test.utf16le b/test.utf16le new file mode 100644 index 0000000..9f610d6 --- /dev/null +++ b/test.utf16le @@ -0,0 +1 @@ +ÿþ¥Ëæ˜)=Ø<ÞÿÛýßH$ \ No newline at end of file diff --git a/utf16.cpp b/utf16.cpp new file mode 100644 index 0000000..eedb521 --- /dev/null +++ b/utf16.cpp @@ -0,0 +1,42 @@ +#include +#include + +void bitprint( uint16_t u ) +{ + for( int i = 15; i >= 0; i-- ) + printf("%c", (u&(1<= 0; i-- ) + printf("%c", (u&(1<>10)&0x3FF)| 0xD800u; + outLo = ((in-0x10000)&0x3FF)| 0xDC00u; + printf("0x%X == 0x%X, 0x%X\n", in, outHi, outLo ); +} + +int32_t utf16tou( uint16_t hi, uint16_t lo ) +{ + return (((uint32_t)hi&0x3FF)<<10 | lo&0x3FF)+0x10000; +} + +int main() +{ + bitprint( 0xD800u ); + bitprint( 0xDC00u ); + uint16_t hi, lo; + utoutf16( 0x1D11E, hi, lo ); // Cat face with wry smile + utoutf16( 0x10FFFD, hi, lo ); // Cat face with wry smile + utoutf16( 0x1F63C, hi, lo ); // Cat face with wry smile + bitprint( hi ); + bitprint( lo ); + printf("0x%X\n", utf16tou( hi, lo ) ); + return 0; +} -- cgit v1.2.3