From abbf45c1da7f3e3a542e6c6339a1bab31283f22e Mon Sep 17 00:00:00 2001 From: Mike Buland Date: Mon, 4 Apr 2011 07:22:10 +0000 Subject: I made some awesome progress on the UtfString system, it stores in native utf16 encoding to make things easier (little endian in our case). It can currently read utf8 and utf16be, but not BOM. It will give you full unicode code points instead of the raw utf16 values, which is pretty slick. --- src/utfstring.cpp | 143 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 141 insertions(+), 2 deletions(-) (limited to 'src/utfstring.cpp') diff --git a/src/utfstring.cpp b/src/utfstring.cpp index 0e2060b..bb0a011 100644 --- a/src/utfstring.cpp +++ b/src/utfstring.cpp @@ -9,17 +9,156 @@ #include "bu/string.h" +#include + Bu::UtfString::UtfString() { } +Bu::UtfString::UtfString( const Bu::String &sInput, Encoding eEnc ) +{ + set( sInput, eEnc ); +} + Bu::UtfString::~UtfString() { } +void Bu::UtfString::set( const Bu::String &sInput, Encoding eEnc ) +{ + switch( eEnc ) + { + case Utf8: + setUtf8( sInput ); + break; + + case Utf16: + case Utf16be: + setUtf16( sInput ); + break; + + case Utf16le: + throw Bu::ExceptionBase("Utf16le not supported yet."); + break; + + case Utf32: + throw Bu::ExceptionBase("Utf32 not supported yet."); + break; + + case Ucs16: + throw Bu::ExceptionBase("Ucs16 not supported yet."); + break; + + case GuessEncoding: + throw Bu::ExceptionBase("Guessing mode not supported yet."); + break; + } +} + +void Bu::UtfString::append( UtfChar ch ) +{ + if( ch >= 0x10000 ) + { + ch -= 0x10000; + append16( ((ch>>10)&0x3FF)| 0xD800u ); + append16( (ch&0x3FF)| 0xDC00u ); + } + else + { + append16( (uint16_t)(ch) ); + } +} + +void Bu::UtfString::setUtf8( const Bu::String &sInput ) +{ + static uint8_t lmask[8] = { + 0x00, + 0x01, + 0x03, + 0x07, + 0x0f, + 0x1f, + 0x3f, + 0x7f + }; + for( Bu::String::const_iterator i = sInput.begin(); i; i++ ) + { + if( ((int)(uint8_t)*i)&0x80 ) + { + int iBytes = 1; + for(; (((uint8_t)(*i))<= 1; iBytes-- ) + { + i++; + uPt |= ((*i)&lmask[6])<<(6*(iBytes-1)); + } + append( uPt ); + } + else + { + append( (Bu::UtfChar)(*i) ); + } + } +} + +void Bu::UtfString::setUtf16( const Bu::String &sInput ) +{ + uint16_t hi, lo; + for( Bu::String::const_iterator i = sInput.begin(); i; i++ ) + { + hi = (((uint8_t)*i)<<8) | ((uint8_t)*(++i)); + append16( hi ); + if( (hi&0xD800u) == 0xD800u ) + { + lo = (((uint8_t)*(++i))<<8) | ((uint8_t)*(++i)); + append16( lo ); + } + } +} + #include "bu/sio.h" using Bu::sio; +Bu::UtfChar Bu::UtfString::get( int iIndex ) +{ + Bu::UtfChar i = aData[iIndex]; + switch( i&0xFC00 ) + { + case 0xD800: + sio << "(hi) "; + return (((i&0x3FF)<<10) | ((aData[iIndex+1]&0x3FF)))+0x10000; + + case 0xDC00: + sio << "(lo) "; + return 0; + + default: + sio << "(--) "; + return i&0xFC00; + } +} + +void Bu::UtfString::debug() +{ + sio << "Raw Utf16: "; + for( int i = 0; i < aData.getSize(); i++ ) + { + if( i > 0 ) + sio << ", "; + sio << "0x" << Fmt::hex() << aData[i]; + } + sio << sio.nl; + sio << "Code Points: "; + for( int i = 0; i < aData.getSize(); i++ ) + { + if( i > 0 ) + sio << ", "; + sio << "0x" << Fmt::hex() << get( i ); + } + sio << sio.nl; +} +/* void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 ) { static uint8_t lmask[8] = { @@ -43,7 +182,7 @@ void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 ) int iBytes = 1; for(; (((uint8_t)(*i))<= 1; iBytes-- ) @@ -68,4 +207,4 @@ void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 ) } sio << sio.nl; } - +*/ -- cgit v1.2.3