From ec05778d5718a7912e506764d443a78d6a6179e3 Mon Sep 17 00:00:00 2001 From: Mike Buland Date: Mon, 5 Nov 2012 22:41:51 +0000 Subject: Converted tabs to spaces with tabconv. --- src/unstable/utfstring.h | 482 +++++++++++++++++++++++------------------------ 1 file changed, 241 insertions(+), 241 deletions(-) (limited to 'src/unstable/utfstring.h') diff --git a/src/unstable/utfstring.h b/src/unstable/utfstring.h index 1bd4cce..560faae 100644 --- a/src/unstable/utfstring.h +++ b/src/unstable/utfstring.h @@ -13,247 +13,247 @@ namespace Bu { - class String; - class Stream; - - /** - * UtfChar isn't actually a character, unicode specifies "code points" not - * characters. The main reason for this is that not all code points define - * usable characters. Some control text directionality, some apply - * properties to other code points which are characters. However, most of - * these distinctions are only important when implementing displays that - * comply with the Unicode standard fully. - */ - typedef uint32_t UtfChar; - - /** - * A unicode string. This class represents a string of unicode code points. - * Every character in unicode can be represented with 21 bits, but we don't - * have a datatype that's 24 bits long, so we return all code points as a - * 32 bit unsigned value represented by Bu::UtfChar. However, the UtfString - * class, for efficiency purposes doesn't store 32 bit values internally. - * It represents all code points in the native utf16 encodeng. This means - * that it may be very difficult to quickly determine the length of a - * UtfString in code points. Unlike many Unicode handling systems, this - * one actually works with complete code points. When using this class you - * don't ever have to know about the inner workings of the different - * encoding schemes. All of the data is dealt with as whole code points. - * - * As an aside, this means that when encoding a UtfString to a Utf16 - * encoding that matches your archetecture this operation will be very - * fast since it will effectively be a raw dump of the internal data - * structures. However, it is highly reccomended that you DO NOT use the - * little endian encodings if you can possibly avoid it. They are not - * reccomended by the Unicode Consortium and are mainly supported as a - * means of communicating with other systems that encode their data - * incorrectly. That said, whenever UtfString encodes the contained string - * it always includes a BOM at the begining (the byte order marker) so that - * proper byte order can be easily determined by the program reading the - * data. - * - *@todo Investigate http://www.unicode.org/reports/tr6/ for compression. - */ - class UtfString - { - public: - enum Encoding - { - Utf8, - Utf16, - Utf16be, - Utf16le, - Utf32, - Utf32be, - Utf32le, - Ucs2, - Ucs4, - GuessEncoding - }; - - UtfString(); - UtfString( const Bu::String &sInput, Encoding eEnc=Utf8 ); - UtfString( const char *sInput, Encoding eEnc=Utf8 ); - virtual ~UtfString(); - - class iterator - { - friend class UtfString; - private: - iterator( UtfString *pSrc, int iCodePos ) : - pSrc( pSrc ), iCodePos( iCodePos ) - { - } - - public: - iterator() : - pSrc( NULL ), iCodePos( 0 ) - { - } - - UtfChar operator*() - { - if( !pSrc ) - throw Bu::ExceptionBase("invalid UtfString::iterator dereferenced."); - return pSrc->get( iCodePos ); - } - - iterator operator++() - { - pSrc->nextChar( iCodePos ); - return *this; - } - - iterator operator++( int ) - { - pSrc->nextChar( iCodePos ); - return *this; - } - - operator bool() const - { - return iCodePos < pSrc->aData.getSize(); - } - - private: - UtfString *pSrc; - int iCodePos; - }; - - class const_iterator - { - friend class UtfString; - private: - const_iterator( const UtfString *pSrc, int iCodePos ) : - pSrc( pSrc ), iCodePos( iCodePos ) - { - } - - public: - const_iterator() : - pSrc( NULL ), iCodePos( 0 ) - { - } - - UtfChar operator*() - { - if( !pSrc ) - throw Bu::ExceptionBase("invalid UtfString::iterator dereferenced."); - return pSrc->get( iCodePos ); - } - - const_iterator operator++() - { - pSrc->nextChar( iCodePos ); - return *this; - } - - const_iterator operator++( int ) - { - pSrc->nextChar( iCodePos ); - return *this; - } - - operator bool() const - { - return iCodePos < pSrc->aData.getSize(); - } - - private: - const UtfString *pSrc; - int iCodePos; - }; - - iterator begin(); - const_iterator begin() const; - - /** - * Append a UtfChar (A unicode code point) to the string. This can be - * any valid code point, and is just the value of the code point, no - * encoding necessary. - */ - void append( UtfChar ch ); - - void append( const UtfString &rSrc ); - - /** - * Set the value of the entire string based on the given input and - * encoding. The default encoding is Utf8, which is compatible with - * 7-bit ascii, so it's a great choice for setting UtfStrings from - * string literals in code. - */ - void set( const Bu::String &sInput, Encoding eEnc=Utf8 ); - - /** - * This encodes the UtfString in the given encoding and outputs it to - * the provided stream. all Utf16 and Utf32 encodings will have the - * correct BOM (byte order marker) at the begining. - */ - void write( Bu::Stream &sOut, Encoding eEnc=Utf8 ) const; - - /** - * This encodes the UtfString in the given encoding and returns it as - * a binary Bu::String. Like write, this also includes the proper BOM - * at the begining. - */ - Bu::String get( Encoding eEnc=Utf8 ) const; - - void debug() const; - - /** - * This may or may not stick around, given an index, this returns a - * codepoint, however there isn't necesarilly a 1:1 ratio between - * indexes and code points. - */ - UtfChar get( int iIndex ) const; - - /** - * This is what to use if you want to iterate through a section of the - * UtfString and you want to use a numerical index. In most cases it - * will be much easier to use an iterator, though. Given an index this - * will return the codepoint at that position and increment iIndex an - * appropriate amount for it to point to the next code point. - */ - UtfChar nextChar( int &iIndex ) const; - - bool operator==( const Bu::UtfString &rhs ) const; - UtfString &operator+=( const Bu::UtfString &rhs ); - UtfString &operator+=( const UtfChar &rhs ); - - private: - void append16( uint16_t i ) { aData.append( i ); } - - void setUtf8( const Bu::String &sInput ); - void setUtf16( const Bu::String &sInput ); - void setUtf16be( const Bu::String &sInput ); - void setUtf16le( const Bu::String &sInput ); - void setUtf32( const Bu::String &sInput ); - void setUtf32be( const Bu::String &sInput ); - void setUtf32le( const Bu::String &sInput ); - - void writeUtf8( Bu::Stream &sOut ) const; - void writeUtf16be( Bu::Stream &sOut ) const; - void writeUtf16le( Bu::Stream &sOut ) const; - void writeUtf32be( Bu::Stream &sOut ) const; - void writeUtf32le( Bu::Stream &sOut ) const; - - private: - Bu::Array aData; - int iRawLen; - int iCharLen; - }; - - // - // Hash support - // - template - uint32_t __calcHashCode( const T &k ); - - template - bool __cmpHashKeys( const T &a, const T &b ); - - template<> uint32_t __calcHashCode( const UtfString &k ); - template<> bool __cmpHashKeys( - const UtfString &a, const UtfString &b ); + class String; + class Stream; + + /** + * UtfChar isn't actually a character, unicode specifies "code points" not + * characters. The main reason for this is that not all code points define + * usable characters. Some control text directionality, some apply + * properties to other code points which are characters. However, most of + * these distinctions are only important when implementing displays that + * comply with the Unicode standard fully. + */ + typedef uint32_t UtfChar; + + /** + * A unicode string. This class represents a string of unicode code points. + * Every character in unicode can be represented with 21 bits, but we don't + * have a datatype that's 24 bits long, so we return all code points as a + * 32 bit unsigned value represented by Bu::UtfChar. However, the UtfString + * class, for efficiency purposes doesn't store 32 bit values internally. + * It represents all code points in the native utf16 encodeng. This means + * that it may be very difficult to quickly determine the length of a + * UtfString in code points. Unlike many Unicode handling systems, this + * one actually works with complete code points. When using this class you + * don't ever have to know about the inner workings of the different + * encoding schemes. All of the data is dealt with as whole code points. + * + * As an aside, this means that when encoding a UtfString to a Utf16 + * encoding that matches your archetecture this operation will be very + * fast since it will effectively be a raw dump of the internal data + * structures. However, it is highly reccomended that you DO NOT use the + * little endian encodings if you can possibly avoid it. They are not + * reccomended by the Unicode Consortium and are mainly supported as a + * means of communicating with other systems that encode their data + * incorrectly. That said, whenever UtfString encodes the contained string + * it always includes a BOM at the begining (the byte order marker) so that + * proper byte order can be easily determined by the program reading the + * data. + * + *@todo Investigate http://www.unicode.org/reports/tr6/ for compression. + */ + class UtfString + { + public: + enum Encoding + { + Utf8, + Utf16, + Utf16be, + Utf16le, + Utf32, + Utf32be, + Utf32le, + Ucs2, + Ucs4, + GuessEncoding + }; + + UtfString(); + UtfString( const Bu::String &sInput, Encoding eEnc=Utf8 ); + UtfString( const char *sInput, Encoding eEnc=Utf8 ); + virtual ~UtfString(); + + class iterator + { + friend class UtfString; + private: + iterator( UtfString *pSrc, int iCodePos ) : + pSrc( pSrc ), iCodePos( iCodePos ) + { + } + + public: + iterator() : + pSrc( NULL ), iCodePos( 0 ) + { + } + + UtfChar operator*() + { + if( !pSrc ) + throw Bu::ExceptionBase("invalid UtfString::iterator dereferenced."); + return pSrc->get( iCodePos ); + } + + iterator operator++() + { + pSrc->nextChar( iCodePos ); + return *this; + } + + iterator operator++( int ) + { + pSrc->nextChar( iCodePos ); + return *this; + } + + operator bool() const + { + return iCodePos < pSrc->aData.getSize(); + } + + private: + UtfString *pSrc; + int iCodePos; + }; + + class const_iterator + { + friend class UtfString; + private: + const_iterator( const UtfString *pSrc, int iCodePos ) : + pSrc( pSrc ), iCodePos( iCodePos ) + { + } + + public: + const_iterator() : + pSrc( NULL ), iCodePos( 0 ) + { + } + + UtfChar operator*() + { + if( !pSrc ) + throw Bu::ExceptionBase("invalid UtfString::iterator dereferenced."); + return pSrc->get( iCodePos ); + } + + const_iterator operator++() + { + pSrc->nextChar( iCodePos ); + return *this; + } + + const_iterator operator++( int ) + { + pSrc->nextChar( iCodePos ); + return *this; + } + + operator bool() const + { + return iCodePos < pSrc->aData.getSize(); + } + + private: + const UtfString *pSrc; + int iCodePos; + }; + + iterator begin(); + const_iterator begin() const; + + /** + * Append a UtfChar (A unicode code point) to the string. This can be + * any valid code point, and is just the value of the code point, no + * encoding necessary. + */ + void append( UtfChar ch ); + + void append( const UtfString &rSrc ); + + /** + * Set the value of the entire string based on the given input and + * encoding. The default encoding is Utf8, which is compatible with + * 7-bit ascii, so it's a great choice for setting UtfStrings from + * string literals in code. + */ + void set( const Bu::String &sInput, Encoding eEnc=Utf8 ); + + /** + * This encodes the UtfString in the given encoding and outputs it to + * the provided stream. all Utf16 and Utf32 encodings will have the + * correct BOM (byte order marker) at the begining. + */ + void write( Bu::Stream &sOut, Encoding eEnc=Utf8 ) const; + + /** + * This encodes the UtfString in the given encoding and returns it as + * a binary Bu::String. Like write, this also includes the proper BOM + * at the begining. + */ + Bu::String get( Encoding eEnc=Utf8 ) const; + + void debug() const; + + /** + * This may or may not stick around, given an index, this returns a + * codepoint, however there isn't necesarilly a 1:1 ratio between + * indexes and code points. + */ + UtfChar get( int iIndex ) const; + + /** + * This is what to use if you want to iterate through a section of the + * UtfString and you want to use a numerical index. In most cases it + * will be much easier to use an iterator, though. Given an index this + * will return the codepoint at that position and increment iIndex an + * appropriate amount for it to point to the next code point. + */ + UtfChar nextChar( int &iIndex ) const; + + bool operator==( const Bu::UtfString &rhs ) const; + UtfString &operator+=( const Bu::UtfString &rhs ); + UtfString &operator+=( const UtfChar &rhs ); + + private: + void append16( uint16_t i ) { aData.append( i ); } + + void setUtf8( const Bu::String &sInput ); + void setUtf16( const Bu::String &sInput ); + void setUtf16be( const Bu::String &sInput ); + void setUtf16le( const Bu::String &sInput ); + void setUtf32( const Bu::String &sInput ); + void setUtf32be( const Bu::String &sInput ); + void setUtf32le( const Bu::String &sInput ); + + void writeUtf8( Bu::Stream &sOut ) const; + void writeUtf16be( Bu::Stream &sOut ) const; + void writeUtf16le( Bu::Stream &sOut ) const; + void writeUtf32be( Bu::Stream &sOut ) const; + void writeUtf32le( Bu::Stream &sOut ) const; + + private: + Bu::Array aData; + int iRawLen; + int iCharLen; + }; + + // + // Hash support + // + template + uint32_t __calcHashCode( const T &k ); + + template + bool __cmpHashKeys( const T &a, const T &b ); + + template<> uint32_t __calcHashCode( const UtfString &k ); + template<> bool __cmpHashKeys( + const UtfString &a, const UtfString &b ); }; #endif -- cgit v1.2.3