From bc6d543210a9df6f578229c6050371ced665fd69 Mon Sep 17 00:00:00 2001 From: Mike Buland Date: Fri, 8 Apr 2011 00:51:24 +0000 Subject: Rearranged the API a bit. --- src/utfstring.h | 88 ++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 75 insertions(+), 13 deletions(-) diff --git a/src/utfstring.h b/src/utfstring.h index be3e6ad..477e272 100644 --- a/src/utfstring.h +++ b/src/utfstring.h @@ -26,6 +26,33 @@ namespace Bu */ typedef uint32_t UtfChar; + /** + * A unicode string. This class represents a string of unicode code points. + * Every character in unicode can be represented with 21 bits, but we don't + * have a datatype that's 24 bits long, so we return all code points as a + * 32 bit unsigned value represented by Bu::UtfChar. However, the UtfString + * class, for efficiency purposes doesn't store 32 bit values internally. + * It represents all code points in the native utf16 encoding. This means + * that it may be very difficult to quickly determine the length of a + * UtfString in code points. Unlike many Unicode handling systems, this + * one actually works with complete code points. When using this class you + * don't ever have to know about the inner workings of the different + * encoding schemes. All of the data is dealt with as whole code points. + * + * As an aside, this means that when encoding a UtfString to a Utf16 + * encoding that matches your architecture this operation will be very + * fast since it will effectively be a raw dump of the internal data + * structures. However, it is highly recommended that you DO NOT use the + * little endian encodings if you can possibly avoid it. They are not + * recommended by the Unicode Consortium and are mainly supported as a + * means of communicating with other systems that encode their data + * incorrectly. 
That said, whenever UtfString encodes the contained string + * it always includes a BOM at the beginning (the byte order marker) so that + * proper byte order can be easily determined by the program reading the + * data. + * + *@todo Investigate http://www.unicode.org/reports/tr6/ for compression. + */ class UtfString { public: @@ -73,9 +100,56 @@ namespace Bu int iCodePos; }; + /** + * Append a UtfChar (A unicode code point) to the string. This can be + * any valid code point, and is just the value of the code point, no + * encoding necessary. + */ void append( UtfChar ch ); + /** + * Set the value of the entire string based on the given input and + * encoding. The default encoding is Utf8, which is compatible with + * 7-bit ASCII, so it's a great choice for setting UtfStrings from + * string literals in code. + */ void set( const Bu::String &sInput, Encoding eEnc=Utf8 ); + + /** + * This encodes the UtfString in the given encoding and outputs it to + * the provided stream. All Utf16 and Utf32 encodings will have the + * correct BOM (byte order marker) at the beginning. + */ + void write( Bu::Stream &sOut, Encoding eEnc=Utf8 ); + + /** + * This encodes the UtfString in the given encoding and returns it as + * a binary Bu::String. Like write, this also includes the proper BOM + * at the beginning. + */ + Bu::String get( Encoding eEnc=Utf8 ); + + void debug(); + + /** + * This may or may not stick around, given an index, this returns a + * codepoint, however there isn't necessarily a 1:1 ratio between + * indexes and code points. + */ + UtfChar get( int iIndex ); + + /** + * This is what to use if you want to iterate through a section of the + * UtfString and you want to use a numerical index. In most cases it + * will be much easier to use an iterator, though. Given an index this + * will return the codepoint at that position and increment iIndex an + * appropriate amount for it to point to the next code point. 
+ */ + UtfChar nextChar( int &iIndex ); + + private: + void append16( uint16_t i ) { aData.append( i ); } + void setUtf8( const Bu::String &sInput ); void setUtf16( const Bu::String &sInput ); void setUtf16be( const Bu::String &sInput ); @@ -83,25 +157,13 @@ namespace Bu void setUtf32( const Bu::String &sInput ); void setUtf32be( const Bu::String &sInput ); void setUtf32le( const Bu::String &sInput ); - - void write( Bu::Stream &sOut, Encoding eEnc=Utf8 ); + void writeUtf8( Bu::Stream &sOut ); void writeUtf16be( Bu::Stream &sOut ); void writeUtf16le( Bu::Stream &sOut ); void writeUtf32be( Bu::Stream &sOut ); void writeUtf32le( Bu::Stream &sOut ); - Bu::String to( Encoding eEnc=Utf8 ); - Bu::String toUtf8(); - - void debug(); - - UtfChar get( int iIndex ); - UtfChar nextChar( int &iIndex ); - - private: - void append16( uint16_t i ) { aData.append( i ); } - private: Bu::Array aData; int iRawLen; -- cgit v1.2.3