diff options
Diffstat (limited to 'src/unstable/utfstring.h')
| -rw-r--r-- | src/unstable/utfstring.h | 174 |
1 files changed, 174 insertions, 0 deletions
diff --git a/src/unstable/utfstring.h b/src/unstable/utfstring.h new file mode 100644 index 0000000..477e272 --- /dev/null +++ b/src/unstable/utfstring.h | |||
| @@ -0,0 +1,174 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2007-2011 Xagasoft, All rights reserved. | ||
| 3 | * | ||
| 4 | * This file is part of the libbu++ library and is released under the | ||
| 5 | * terms of the license contained in the file LICENSE. | ||
| 6 | */ | ||
| 7 | |||
| 8 | #ifndef BU_UTF_STRING_H | ||
| 9 | #define BU_UTF_STRING_H | ||
| 10 | |||
| 11 | #include <stdint.h> | ||
| 12 | #include "bu/array.h" | ||
| 13 | |||
| 14 | namespace Bu | ||
| 15 | { | ||
| 16 | class String; | ||
| 17 | class Stream; | ||
| 18 | |||
| 19 | /** | ||
| 20 | * UtfChar isn't actually a character; Unicode specifies "code points", not | ||
| 21 | * characters. The main reason for this is that not all code points define | ||
| 22 | * usable characters. Some control text directionality, some apply | ||
| 23 | * properties to other code points which are characters. However, most of | ||
| 24 | * these distinctions are only important when implementing displays that | ||
| 25 | * comply with the Unicode standard fully. | ||
| 26 | */ | ||
| 27 | typedef uint32_t UtfChar; | ||
| 28 | |||
| 29 | /** | ||
| 30 | * A unicode string. This class represents a string of unicode code points. | ||
| 31 | * Every character in unicode can be represented with 21 bits, but we don't | ||
| 32 | * have a datatype that's 24 bits long, so we return all code points as a | ||
| 33 | * 32 bit unsigned value represented by Bu::UtfChar. However, the UtfString | ||
| 34 | * class, for efficiency purposes doesn't store 32 bit values internally. | ||
| 35 | * It represents all code points in the native utf16 encoding. This means | ||
| 36 | * that it may be very difficult to quickly determine the length of a | ||
| 37 | * UtfString in code points. Unlike many Unicode handling systems, this | ||
| 38 | * one actually works with complete code points. When using this class you | ||
| 39 | * don't ever have to know about the inner workings of the different | ||
| 40 | * encoding schemes. All of the data is dealt with as whole code points. | ||
| 41 | * | ||
| 42 | * As an aside, this means that when encoding a UtfString to a Utf16 | ||
| 43 | * encoding that matches your architecture, this operation will be very | ||
| 44 | * fast since it will effectively be a raw dump of the internal data | ||
| 45 | * structures. However, it is highly recommended that you DO NOT use the | ||
| 46 | * little endian encodings if you can possibly avoid it. They are not | ||
| 47 | * recommended by the Unicode Consortium and are mainly supported as a | ||
| 48 | * means of communicating with other systems that encode their data | ||
| 49 | * incorrectly. That said, whenever UtfString encodes the contained string | ||
| 50 | * it always includes a BOM at the beginning (the byte order marker) so that | ||
| 51 | * proper byte order can be easily determined by the program reading the | ||
| 52 | * data. | ||
| 53 | * | ||
| 54 | *@todo Investigate http://www.unicode.org/reports/tr6/ for compression. | ||
| 55 | */ | ||
| 56 | class UtfString | ||
| 57 | { | ||
| 58 | public: | ||
| 59 | enum Encoding | ||
| 60 | { | ||
| 61 | Utf8, | ||
| 62 | Utf16, | ||
| 63 | Utf16be, | ||
| 64 | Utf16le, | ||
| 65 | Utf32, | ||
| 66 | Utf32be, | ||
| 67 | Utf32le, | ||
| 68 | Ucs2, | ||
| 69 | Ucs4, | ||
| 70 | GuessEncoding | ||
| 71 | }; | ||
| 72 | |||
| 73 | UtfString(); | ||
| 74 | UtfString( const Bu::String &sInput, Encoding eEnc=Utf8 ); | ||
| 75 | virtual ~UtfString(); | ||
| 76 | |||
| 77 | class iterator | ||
| 78 | { | ||
| 79 | private: | ||
| 80 | iterator( UtfString *pSrc, int iCodePos ) : | ||
| 81 | pSrc( pSrc ), iCodePos( iCodePos ) | ||
| 82 | { | ||
| 83 | } | ||
| 84 | |||
| 85 | public: | ||
| 86 | iterator() : | ||
| 87 | pSrc( NULL ), iCodePos( 0 ) | ||
| 88 | { | ||
| 89 | } | ||
| 90 | |||
| 91 | UtfChar operator*() | ||
| 92 | { | ||
| 93 | if( !pSrc ) | ||
| 94 | throw Bu::ExceptionBase("invalid UtfString::iterator dereferenced."); | ||
| 95 | return pSrc->nextChar( iCodePos ); | ||
| 96 | } | ||
| 97 | |||
| 98 | private: | ||
| 99 | UtfString *pSrc; | ||
| 100 | int iCodePos; | ||
| 101 | }; | ||
| 102 | |||
| 103 | /** | ||
| 104 | * Append a UtfChar (A unicode code point) to the string. This can be | ||
| 105 | * any valid code point, and is just the value of the code point, no | ||
| 106 | * encoding necessary. | ||
| 107 | */ | ||
| 108 | void append( UtfChar ch ); | ||
| 109 | |||
| 110 | /** | ||
| 111 | * Set the value of the entire string based on the given input and | ||
| 112 | * encoding. The default encoding is Utf8, which is compatible with | ||
| 113 | * 7-bit ascii, so it's a great choice for setting UtfStrings from | ||
| 114 | * string literals in code. | ||
| 115 | */ | ||
| 116 | void set( const Bu::String &sInput, Encoding eEnc=Utf8 ); | ||
| 117 | |||
| 118 | /** | ||
| 119 | * This encodes the UtfString in the given encoding and outputs it to | ||
| 120 | * the provided stream. all Utf16 and Utf32 encodings will have the | ||
| 121 | * correct BOM (byte order marker) at the begining. | ||
| 122 | */ | ||
| 123 | void write( Bu::Stream &sOut, Encoding eEnc=Utf8 ); | ||
| 124 | |||
| 125 | /** | ||
| 126 | * This encodes the UtfString in the given encoding and returns it as | ||
| 127 | * a binary Bu::String. Like write, this also includes the proper BOM | ||
| 128 | * at the begining. | ||
| 129 | */ | ||
| 130 | Bu::String get( Encoding eEnc=Utf8 ); | ||
| 131 | |||
| 132 | void debug(); | ||
| 133 | |||
| 134 | /** | ||
| 135 | * This may or may not stick around, given an index, this returns a | ||
| 136 | * codepoint, however there isn't necesarilly a 1:1 ratio between | ||
| 137 | * indexes and code points. | ||
| 138 | */ | ||
| 139 | UtfChar get( int iIndex ); | ||
| 140 | |||
| 141 | /** | ||
| 142 | * This is what to use if you want to iterate through a section of the | ||
| 143 | * UtfString and you want to use a numerical index. In most cases it | ||
| 144 | * will be much easier to use an iterator, though. Given an index this | ||
| 145 | * will return the codepoint at that position and increment iIndex an | ||
| 146 | * appropriate amount for it to point to the next code point. | ||
| 147 | */ | ||
| 148 | UtfChar nextChar( int &iIndex ); | ||
| 149 | |||
| 150 | private: | ||
| 151 | void append16( uint16_t i ) { aData.append( i ); } | ||
| 152 | |||
| 153 | void setUtf8( const Bu::String &sInput ); | ||
| 154 | void setUtf16( const Bu::String &sInput ); | ||
| 155 | void setUtf16be( const Bu::String &sInput ); | ||
| 156 | void setUtf16le( const Bu::String &sInput ); | ||
| 157 | void setUtf32( const Bu::String &sInput ); | ||
| 158 | void setUtf32be( const Bu::String &sInput ); | ||
| 159 | void setUtf32le( const Bu::String &sInput ); | ||
| 160 | |||
| 161 | void writeUtf8( Bu::Stream &sOut ); | ||
| 162 | void writeUtf16be( Bu::Stream &sOut ); | ||
| 163 | void writeUtf16le( Bu::Stream &sOut ); | ||
| 164 | void writeUtf32be( Bu::Stream &sOut ); | ||
| 165 | void writeUtf32le( Bu::Stream &sOut ); | ||
| 166 | |||
| 167 | private: | ||
| 168 | Bu::Array<uint16_t> aData; | ||
| 169 | int iRawLen; | ||
| 170 | int iCharLen; | ||
| 171 | }; | ||
| 172 | }; | ||
| 173 | |||
| 174 | #endif | ||
