From 88004d87d513dcba767b1dae1e5199a89b22ce36 Mon Sep 17 00:00:00 2001
From: Mike Buland
Date: Tue, 22 Mar 2011 19:25:42 +0000
Subject: We now have a UTF-8 test parser, I'm going to move it into a functor, I think.

---
NOTE(review): this copy of the patch was recovered from a scrape that had
collapsed all newlines and stripped every "<letter...>" span as an HTML tag.
Line structure has been restored to match the hunk line counts.  The
following spans are RECONSTRUCTED, not read from the scrape -- confirm them
against upstream commit 88004d87 before applying:
  * src/tests/utf.cpp: the three #include targets (inferred from the use of
    Bu::File, Bu::String and Bu::UtfString; their order is a guess).
  * src/utfstring.cpp, debugUtf8(): the six lines from the byte-counting
    "for(; ...<<iBytes..." loop through the "for( iBytes--; iBytes >= 1; ..."
    header, including the declaration of uPt.  The logic is inferred from
    the surviving "uPt |= ((*i)&lmask[6])<<(6*(iBytes-1))" line and the
    hunk's 52-line added count.
Other stripped spans (author e-mail, any template arguments in context
comments) could not be recovered and are left as found.

 src/tests/utf.cpp | 22 ++++++++++++++++++++++
 src/utfstring.cpp | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/utfstring.h   |  4 ++++
 3 files changed, 80 insertions(+)
 create mode 100644 src/tests/utf.cpp

diff --git a/src/tests/utf.cpp b/src/tests/utf.cpp
new file mode 100644
index 0000000..59d49c6
--- /dev/null
+++ b/src/tests/utf.cpp
@@ -0,0 +1,22 @@
+#include <bu/file.h>
+#include <bu/string.h>
+#include <bu/utfstring.h>
+
+int main( int argc, char *argv[] )
+{
+	argc--, argv++;
+
+	for( char **sFile = argv; *sFile; sFile++ )
+	{
+		Bu::File fIn( *sFile, Bu::File::Read );
+		Bu::String sUtf8;
+		char buf[4096];
+		while( !fIn.isEos() )
+		{
+			int iAmnt = fIn.read( buf, 4096 );
+			sUtf8.append( buf, iAmnt );
+		}
+		Bu::UtfString::debugUtf8( sUtf8 );
+	}
+}
+
diff --git a/src/utfstring.cpp b/src/utfstring.cpp
index eb23713..0e2060b 100644
--- a/src/utfstring.cpp
+++ b/src/utfstring.cpp
@@ -7,6 +7,8 @@
 
 #include "bu/utfstring.h"
 
+#include "bu/string.h"
+
 Bu::UtfString::UtfString()
 {
 }
@@ -15,3 +17,55 @@ Bu::UtfString::~UtfString()
 {
 }
 
+#include "bu/sio.h"
+using Bu::sio;
+
+void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 )
+{
+	static uint8_t lmask[8] = {
+		0x00,
+		0x01,
+		0x03,
+		0x07,
+		0x0f,
+		0x1f,
+		0x3f,
+		0x7f
+	};
+	for( Bu::String::const_iterator i = sUtf8.begin(); i; i++ )
+	{
+		if( i != sUtf8.begin() )
+			sio << ", ";
+		if( ((int)(uint8_t)*i)&0x80 )
+		{
+//			sio << "Flag byte: " << Bu::Fmt().radix(2).width(8).fill('0')
+//				<< (int)(uint8_t)*i << sio.nl;
+			int iBytes = 1;
+			for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { }
+//			sio << "iBytes = " << iBytes << sio.nl;
+			point uPt = ((*i) & lmask[7-iBytes])<<(6*(iBytes-1));
+//			sio << "mask: " << Bu::Fmt().radix(2).width(8).fill('0')
+//				<< (int)lmask[7-iBytes] << sio.nl;
+			for( iBytes--; iBytes >= 1; iBytes-- )
+			{
+//				sio << "iBytes = " << iBytes << ", shift = " << (6*(iBytes-1))
+//					<< sio.nl;
+//				sio << "next: " << Bu::Fmt().radix(2).width(8).fill('0')
+//					<< (int)(uint8_t)*i << sio.nl
+//					<< "mask: " << Bu::Fmt().radix(2).width(8).fill('0')
+//					<< (int)lmask[6] << sio.nl;
+				i++;
+				uPt |= ((*i)&lmask[6])<<(6*(iBytes-1));
+			}
+			sio << uPt;
+//			sio << " (" << Bu::Fmt( 8, 2 ).fill('0')
+//				<< uPt << ")";
+		}
+		else
+		{
+			sio << (int)((uint8_t)*i);
+		}
+	}
+	sio << sio.nl;
+}
+
diff --git a/src/utfstring.h b/src/utfstring.h
index 56e544e..3bdf51c 100644
--- a/src/utfstring.h
+++ b/src/utfstring.h
@@ -12,6 +12,8 @@
 
 namespace Bu
 {
+	class String;
+
 	class UtfString
 	{
 	public:
@@ -20,6 +22,8 @@ namespace Bu
 
 		typedef uint32_t point;
 
+		static void debugUtf8( const Bu::String &sUtf8 );
+
 	private:
 //		typedef BasicString RawString;
 //		RawString rsStore;
-- 
cgit v1.2.3