From 27aecbc60be6c80ce221f29c01f743de714faa63 Mon Sep 17 00:00:00 2001 From: Mike Buland Date: Thu, 7 Apr 2011 05:44:42 +0000 Subject: Pretty sure all utf encoders and decoders are complete and tested. --- src/tests/utf.cpp | 42 ++++++++++++++++ src/utfstring.cpp | 143 +++++++++++++++++++++++++++++++++++++++++++++++++----- src/utfstring.h | 2 - 3 files changed, 174 insertions(+), 13 deletions(-) (limited to 'src') diff --git a/src/tests/utf.cpp b/src/tests/utf.cpp index 9e075e2..01bac7e 100644 --- a/src/tests/utf.cpp +++ b/src/tests/utf.cpp @@ -4,6 +4,46 @@ int main( int argc, char *argv[] ) { + Bu::File fIn("utf8.in", Bu::File::Read ); + Bu::String sUtf8; + char buf[4096]; + while( !fIn.isEos() ) + { + int iAmnt = fIn.read( buf, 4096 ); + sUtf8.append( buf, iAmnt ); + } + Bu::UtfString us( sUtf8, Bu::UtfString::Utf8 ); + us.debug(); + { + Bu::File fOut("utf8.out", Bu::File::WriteNew ); + us.write( fOut, Bu::UtfString::Utf8 ); + } + { + Bu::File fOut("utf16.out", Bu::File::WriteNew ); + us.write( fOut, Bu::UtfString::Utf16 ); + } + { + Bu::File fOut("utf16le.out", Bu::File::WriteNew ); + us.write( fOut, Bu::UtfString::Utf16le ); + } + { + Bu::File fOut("utf16be.out", Bu::File::WriteNew ); + us.write( fOut, Bu::UtfString::Utf16be ); + } + { + Bu::File fOut("utf32.out", Bu::File::WriteNew ); + us.write( fOut, Bu::UtfString::Utf32 ); + } + { + Bu::File fOut("utf32le.out", Bu::File::WriteNew ); + us.write( fOut, Bu::UtfString::Utf32le ); + } + { + Bu::File fOut("utf32be.out", Bu::File::WriteNew ); + us.write( fOut, Bu::UtfString::Utf32be ); + } + + /* argc--, argv++; for( char **sFile = argv; *sFile; sFile++ ) @@ -17,7 +57,9 @@ int main( int argc, char *argv[] ) sUtf8.append( buf, iAmnt ); } Bu::UtfString us( sUtf8, Bu::UtfString::Utf16 ); + us.debug(); } + */ } diff --git a/src/utfstring.cpp b/src/utfstring.cpp index c9da52f..3f57618 100644 --- a/src/utfstring.cpp +++ b/src/utfstring.cpp @@ -259,8 +259,8 @@ void Bu::UtfString::write( Bu::Stream &sOut, Encoding eEnc ) break; case Utf16: - writeUtf16( sOut ); - break; +// writeUtf16( sOut ); +// break; case Utf16be: writeUtf16be( sOut ); @@ -271,8 +271,8 @@ void Bu::UtfString::write( Bu::Stream &sOut, Encoding eEnc ) break; case Utf32: - writeUtf32( sOut ); - break; +// writeUtf32( sOut ); +// break; case Utf32be: writeUtf32be( sOut ); @@ -300,30 +300,151 @@ void Bu::UtfString::write( Bu::Stream &sOut, Encoding eEnc ) void Bu::UtfString::writeUtf8( Bu::Stream &sOut ) { + int iPos = 0; + while( iPos < aData.getSize() ) + { + uint8_t uByte; + Bu::UtfChar chr = nextChar( iPos ); + if( chr >= 0x010000 ) + { + // Four bytes + // 111 111111 111111 111111 + uByte = (chr>>18)|0xF0; + sOut.write( &uByte, 1 ); + uByte = (chr>>12)&0x3F|0x80; + sOut.write( &uByte, 1 ); + uByte = (chr>>6)&0x3F|0x80; + sOut.write( &uByte, 1 ); + uByte = (chr&0x3F)|0x80; + sOut.write( &uByte, 1 ); + } + else if( chr >= 0x800 ) + { + // Three bytes + // 1111 111111 111111 + uByte = (chr>>12)|0xE0; + sOut.write( &uByte, 1 ); + uByte = (chr>>6)&0x3F|0x80; + sOut.write( &uByte, 1 ); + uByte = (chr&0x3F)|0x80; + sOut.write( &uByte, 1 ); + } + else if( chr >= 0x80 ) + { + // Two bytes + // 11111 111111 + uByte = (chr>>6)|0xC0; + sOut.write( &uByte, 1 ); + uByte = (chr&0x3F)|0x80; + sOut.write( &uByte, 1 ); + } + else + { + // One byte + uByte = chr; + sOut.write( &uByte, 1 ); + } + } } - +/* void Bu::UtfString::writeUtf16( Bu::Stream &sOut ) { } - +*/ void Bu::UtfString::writeUtf16be( Bu::Stream &sOut ) { +#if BYTE_ORDER == BIG_ENDIAN + uint16_t iTmp = 0xFEFF; // Byte Order Marker + sOut.write( &iTmp, 2 ); + for( Array::iterator i = aData.begin(); i; i++ ) + { + iTmp = *i; + sOut.write( &iTmp, 2 ); + } +#else + uint16_t iTmp = 0xFEFF; // Byte Order Marker + iTmp = (iTmp>>8) | (iTmp<<8); + sOut.write( &iTmp, 2 ); + for( Array::iterator i = aData.begin(); i; i++ ) + { + iTmp = *i; + iTmp = (iTmp>>8) | (iTmp<<8); + sOut.write( &iTmp, 2 ); + } +#endif } void Bu::UtfString::writeUtf16le( Bu::Stream &sOut ) { -} - -void Bu::UtfString::writeUtf32( Bu::Stream &sOut ) -{ +#if BYTE_ORDER == LITTLE_ENDIAN + uint16_t iTmp = 0xFEFF; // Byte Order Marker + sOut.write( &iTmp, 2 ); + for( Array::iterator i = aData.begin(); i; i++ ) + { + iTmp = *i; + sOut.write( &iTmp, 2 ); + } +#else + uint16_t iTmp = 0xFEFF; // Byte Order Marker + iTmp = (iTmp>>8) | (iTmp<<8); + sOut.write( &iTmp, 2 ); + for( Array::iterator i = aData.begin(); i; i++ ) + { + iTmp = *i; + iTmp = (iTmp>>8) | (iTmp<<8); + sOut.write( &iTmp, 2 ); + } +#endif } void Bu::UtfString::writeUtf32be( Bu::Stream &sOut ) { +#if BYTE_ORDER == BIG_ENDIAN + uint32_t iTmp = 0xFEFF; // Byte Order Marker + sOut.write( &iTmp, 4 ); + int i = 0; + while( i < aData.getSize() ) + { + iTmp = nextChar( i ); + sOut.write( &iTmp, 4 ); + } +#else + uint32_t iTmp = 0xFEFF; // Byte Order Marker + iTmp = (iTmp>>24)|(iTmp<<24)|((iTmp&0xff0000)>>8)|((iTmp&0xff00)<<8); + sOut.write( &iTmp, 4 ); + int i = 0; + while( i < aData.getSize() ) + { + iTmp = nextChar( i ); + iTmp = (iTmp>>24)|(iTmp<<24)|((iTmp&0xff0000)>>8)|((iTmp&0xff00)<<8); + sOut.write( &iTmp, 4 ); + } +#endif } void Bu::UtfString::writeUtf32le( Bu::Stream &sOut ) { +#if BYTE_ORDER == LITTLE_ENDIAN + uint32_t iTmp = 0xFEFF; // Byte Order Marker + sOut.write( &iTmp, 4 ); + int i = 0; + while( i < aData.getSize() ) + { + iTmp = nextChar( i ); + sOut.write( &iTmp, 4 ); + } +#else + uint32_t iTmp = 0xFEFF; // Byte Order Marker + iTmp = (iTmp>>24)|(iTmp<<24)|((iTmp&0xff0000)>>8)|((iTmp&0xff00)<<8); + sOut.write( &iTmp, 4 ); + int i = 0; + while( i < aData.getSize() ) + { + iTmp = nextChar( i ); + iTmp = (iTmp>>24)|(iTmp<<24)|((iTmp&0xff0000)>>8)|((iTmp&0xff00)<<8); + sOut.write( &iTmp, 4 ); + } +#endif } Bu::UtfChar Bu::UtfString::get( int iIndex ) @@ -362,7 +483,7 @@ void Bu::UtfString::debug() { if( i > 0 ) sio << ", "; - sio << "0x" << Fmt::hex() << get( i ); + sio << "0x" << Fmt::hex() << nextChar( i ); } sio << sio.nl; } diff --git a/src/utfstring.h b/src/utfstring.h index 8448ea4..be3e6ad 100644 --- a/src/utfstring.h +++ b/src/utfstring.h @@ -86,10 +86,8 @@ namespace Bu void write( Bu::Stream &sOut, Encoding eEnc=Utf8 ); void writeUtf8( Bu::Stream &sOut ); - void writeUtf16( Bu::Stream &sOut ); void writeUtf16be( Bu::Stream &sOut ); void writeUtf16le( Bu::Stream &sOut ); - void writeUtf32( Bu::Stream &sOut ); void writeUtf32be( Bu::Stream &sOut ); void writeUtf32le( Bu::Stream &sOut ); -- cgit v1.2.3