diff options
Diffstat (limited to 'src/unstable/utfstring.h')
-rw-r--r-- | src/unstable/utfstring.h | 482 |
1 files changed, 241 insertions, 241 deletions
diff --git a/src/unstable/utfstring.h b/src/unstable/utfstring.h index 1bd4cce..560faae 100644 --- a/src/unstable/utfstring.h +++ b/src/unstable/utfstring.h | |||
@@ -13,247 +13,247 @@ | |||
13 | 13 | ||
14 | namespace Bu | 14 | namespace Bu |
15 | { | 15 | { |
// Forward declarations: this header only names these types in function
// signatures (references and return types), so the full definitions are
// not needed here.
class String;
class Stream;
18 | 18 | ||
/**
 * A single unicode code point, held as a 32 bit unsigned value.
 *
 * UtfChar isn't actually a character; unicode specifies "code points", not
 * characters.  The main reason for this is that not all code points define
 * usable characters.  Some control text directionality, some apply
 * properties to other code points which are characters.  However, most of
 * these distinctions are only important when implementing displays that
 * comply with the Unicode standard fully.
 */
typedef uint32_t UtfChar;
28 | 28 | ||
29 | /** | 29 | /** |
30 | * A unicode string. This class represents a string of unicode code points. | 30 | * A unicode string. This class represents a string of unicode code points. |
31 | * Every character in unicode can be represented with 21 bits, but we don't | 31 | * Every character in unicode can be represented with 21 bits, but we don't |
32 | * have a datatype that's 24 bits long, so we return all code points as a | 32 | * have a datatype that's 24 bits long, so we return all code points as a |
33 | * 32 bit unsigned value represented by Bu::UtfChar. However, the UtfString | 33 | * 32 bit unsigned value represented by Bu::UtfChar. However, the UtfString |
34 | * class, for efficiency purposes doesn't store 32 bit values internally. | 34 | * class, for efficiency purposes doesn't store 32 bit values internally. |
35 | * It represents all code points in the native utf16 encodeng. This means | 35 | * It represents all code points in the native utf16 encodeng. This means |
36 | * that it may be very difficult to quickly determine the length of a | 36 | * that it may be very difficult to quickly determine the length of a |
37 | * UtfString in code points. Unlike many Unicode handling systems, this | 37 | * UtfString in code points. Unlike many Unicode handling systems, this |
38 | * one actually works with complete code points. When using this class you | 38 | * one actually works with complete code points. When using this class you |
39 | * don't ever have to know about the inner workings of the different | 39 | * don't ever have to know about the inner workings of the different |
40 | * encoding schemes. All of the data is dealt with as whole code points. | 40 | * encoding schemes. All of the data is dealt with as whole code points. |
41 | * | 41 | * |
42 | * As an aside, this means that when encoding a UtfString to a Utf16 | 42 | * As an aside, this means that when encoding a UtfString to a Utf16 |
43 | * encoding that matches your archetecture this operation will be very | 43 | * encoding that matches your archetecture this operation will be very |
44 | * fast since it will effectively be a raw dump of the internal data | 44 | * fast since it will effectively be a raw dump of the internal data |
45 | * structures. However, it is highly reccomended that you DO NOT use the | 45 | * structures. However, it is highly reccomended that you DO NOT use the |
46 | * little endian encodings if you can possibly avoid it. They are not | 46 | * little endian encodings if you can possibly avoid it. They are not |
47 | * reccomended by the Unicode Consortium and are mainly supported as a | 47 | * reccomended by the Unicode Consortium and are mainly supported as a |
48 | * means of communicating with other systems that encode their data | 48 | * means of communicating with other systems that encode their data |
49 | * incorrectly. That said, whenever UtfString encodes the contained string | 49 | * incorrectly. That said, whenever UtfString encodes the contained string |
50 | * it always includes a BOM at the begining (the byte order marker) so that | 50 | * it always includes a BOM at the begining (the byte order marker) so that |
51 | * proper byte order can be easily determined by the program reading the | 51 | * proper byte order can be easily determined by the program reading the |
52 | * data. | 52 | * data. |
53 | * | 53 | * |
54 | *@todo Investigate http://www.unicode.org/reports/tr6/ for compression. | 54 | *@todo Investigate http://www.unicode.org/reports/tr6/ for compression. |
55 | */ | 55 | */ |
56 | class UtfString | 56 | class UtfString |
57 | { | 57 | { |
58 | public: | 58 | public: |
59 | enum Encoding | 59 | enum Encoding |
60 | { | 60 | { |
61 | Utf8, | 61 | Utf8, |
62 | Utf16, | 62 | Utf16, |
63 | Utf16be, | 63 | Utf16be, |
64 | Utf16le, | 64 | Utf16le, |
65 | Utf32, | 65 | Utf32, |
66 | Utf32be, | 66 | Utf32be, |
67 | Utf32le, | 67 | Utf32le, |
68 | Ucs2, | 68 | Ucs2, |
69 | Ucs4, | 69 | Ucs4, |
70 | GuessEncoding | 70 | GuessEncoding |
71 | }; | 71 | }; |
72 | 72 | ||
73 | UtfString(); | 73 | UtfString(); |
74 | UtfString( const Bu::String &sInput, Encoding eEnc=Utf8 ); | 74 | UtfString( const Bu::String &sInput, Encoding eEnc=Utf8 ); |
75 | UtfString( const char *sInput, Encoding eEnc=Utf8 ); | 75 | UtfString( const char *sInput, Encoding eEnc=Utf8 ); |
76 | virtual ~UtfString(); | 76 | virtual ~UtfString(); |
77 | 77 | ||
78 | class iterator | 78 | class iterator |
79 | { | 79 | { |
80 | friend class UtfString; | 80 | friend class UtfString; |
81 | private: | 81 | private: |
82 | iterator( UtfString *pSrc, int iCodePos ) : | 82 | iterator( UtfString *pSrc, int iCodePos ) : |
83 | pSrc( pSrc ), iCodePos( iCodePos ) | 83 | pSrc( pSrc ), iCodePos( iCodePos ) |
84 | { | 84 | { |
85 | } | 85 | } |
86 | 86 | ||
87 | public: | 87 | public: |
88 | iterator() : | 88 | iterator() : |
89 | pSrc( NULL ), iCodePos( 0 ) | 89 | pSrc( NULL ), iCodePos( 0 ) |
90 | { | 90 | { |
91 | } | 91 | } |
92 | 92 | ||
93 | UtfChar operator*() | 93 | UtfChar operator*() |
94 | { | 94 | { |
95 | if( !pSrc ) | 95 | if( !pSrc ) |
96 | throw Bu::ExceptionBase("invalid UtfString::iterator dereferenced."); | 96 | throw Bu::ExceptionBase("invalid UtfString::iterator dereferenced."); |
97 | return pSrc->get( iCodePos ); | 97 | return pSrc->get( iCodePos ); |
98 | } | 98 | } |
99 | 99 | ||
100 | iterator operator++() | 100 | iterator operator++() |
101 | { | 101 | { |
102 | pSrc->nextChar( iCodePos ); | 102 | pSrc->nextChar( iCodePos ); |
103 | return *this; | 103 | return *this; |
104 | } | 104 | } |
105 | 105 | ||
106 | iterator operator++( int ) | 106 | iterator operator++( int ) |
107 | { | 107 | { |
108 | pSrc->nextChar( iCodePos ); | 108 | pSrc->nextChar( iCodePos ); |
109 | return *this; | 109 | return *this; |
110 | } | 110 | } |
111 | 111 | ||
112 | operator bool() const | 112 | operator bool() const |
113 | { | 113 | { |
114 | return iCodePos < pSrc->aData.getSize(); | 114 | return iCodePos < pSrc->aData.getSize(); |
115 | } | 115 | } |
116 | 116 | ||
117 | private: | 117 | private: |
118 | UtfString *pSrc; | 118 | UtfString *pSrc; |
119 | int iCodePos; | 119 | int iCodePos; |
120 | }; | 120 | }; |
121 | 121 | ||
122 | class const_iterator | 122 | class const_iterator |
123 | { | 123 | { |
124 | friend class UtfString; | 124 | friend class UtfString; |
125 | private: | 125 | private: |
126 | const_iterator( const UtfString *pSrc, int iCodePos ) : | 126 | const_iterator( const UtfString *pSrc, int iCodePos ) : |
127 | pSrc( pSrc ), iCodePos( iCodePos ) | 127 | pSrc( pSrc ), iCodePos( iCodePos ) |
128 | { | 128 | { |
129 | } | 129 | } |
130 | 130 | ||
131 | public: | 131 | public: |
132 | const_iterator() : | 132 | const_iterator() : |
133 | pSrc( NULL ), iCodePos( 0 ) | 133 | pSrc( NULL ), iCodePos( 0 ) |
134 | { | 134 | { |
135 | } | 135 | } |
136 | 136 | ||
137 | UtfChar operator*() | 137 | UtfChar operator*() |
138 | { | 138 | { |
139 | if( !pSrc ) | 139 | if( !pSrc ) |
140 | throw Bu::ExceptionBase("invalid UtfString::iterator dereferenced."); | 140 | throw Bu::ExceptionBase("invalid UtfString::iterator dereferenced."); |
141 | return pSrc->get( iCodePos ); | 141 | return pSrc->get( iCodePos ); |
142 | } | 142 | } |
143 | 143 | ||
144 | const_iterator operator++() | 144 | const_iterator operator++() |
145 | { | 145 | { |
146 | pSrc->nextChar( iCodePos ); | 146 | pSrc->nextChar( iCodePos ); |
147 | return *this; | 147 | return *this; |
148 | } | 148 | } |
149 | 149 | ||
150 | const_iterator operator++( int ) | 150 | const_iterator operator++( int ) |
151 | { | 151 | { |
152 | pSrc->nextChar( iCodePos ); | 152 | pSrc->nextChar( iCodePos ); |
153 | return *this; | 153 | return *this; |
154 | } | 154 | } |
155 | 155 | ||
156 | operator bool() const | 156 | operator bool() const |
157 | { | 157 | { |
158 | return iCodePos < pSrc->aData.getSize(); | 158 | return iCodePos < pSrc->aData.getSize(); |
159 | } | 159 | } |
160 | 160 | ||
161 | private: | 161 | private: |
162 | const UtfString *pSrc; | 162 | const UtfString *pSrc; |
163 | int iCodePos; | 163 | int iCodePos; |
164 | }; | 164 | }; |
165 | 165 | ||
166 | iterator begin(); | 166 | iterator begin(); |
167 | const_iterator begin() const; | 167 | const_iterator begin() const; |
168 | 168 | ||
169 | /** | 169 | /** |
170 | * Append a UtfChar (A unicode code point) to the string. This can be | 170 | * Append a UtfChar (A unicode code point) to the string. This can be |
171 | * any valid code point, and is just the value of the code point, no | 171 | * any valid code point, and is just the value of the code point, no |
172 | * encoding necessary. | 172 | * encoding necessary. |
173 | */ | 173 | */ |
174 | void append( UtfChar ch ); | 174 | void append( UtfChar ch ); |
175 | 175 | ||
176 | void append( const UtfString &rSrc ); | 176 | void append( const UtfString &rSrc ); |
177 | 177 | ||
178 | /** | 178 | /** |
179 | * Set the value of the entire string based on the given input and | 179 | * Set the value of the entire string based on the given input and |
180 | * encoding. The default encoding is Utf8, which is compatible with | 180 | * encoding. The default encoding is Utf8, which is compatible with |
181 | * 7-bit ascii, so it's a great choice for setting UtfStrings from | 181 | * 7-bit ascii, so it's a great choice for setting UtfStrings from |
182 | * string literals in code. | 182 | * string literals in code. |
183 | */ | 183 | */ |
184 | void set( const Bu::String &sInput, Encoding eEnc=Utf8 ); | 184 | void set( const Bu::String &sInput, Encoding eEnc=Utf8 ); |
185 | 185 | ||
186 | /** | 186 | /** |
187 | * This encodes the UtfString in the given encoding and outputs it to | 187 | * This encodes the UtfString in the given encoding and outputs it to |
188 | * the provided stream. all Utf16 and Utf32 encodings will have the | 188 | * the provided stream. all Utf16 and Utf32 encodings will have the |
189 | * correct BOM (byte order marker) at the begining. | 189 | * correct BOM (byte order marker) at the begining. |
190 | */ | 190 | */ |
191 | void write( Bu::Stream &sOut, Encoding eEnc=Utf8 ) const; | 191 | void write( Bu::Stream &sOut, Encoding eEnc=Utf8 ) const; |
192 | 192 | ||
193 | /** | 193 | /** |
194 | * This encodes the UtfString in the given encoding and returns it as | 194 | * This encodes the UtfString in the given encoding and returns it as |
195 | * a binary Bu::String. Like write, this also includes the proper BOM | 195 | * a binary Bu::String. Like write, this also includes the proper BOM |
196 | * at the begining. | 196 | * at the begining. |
197 | */ | 197 | */ |
198 | Bu::String get( Encoding eEnc=Utf8 ) const; | 198 | Bu::String get( Encoding eEnc=Utf8 ) const; |
199 | 199 | ||
200 | void debug() const; | 200 | void debug() const; |
201 | 201 | ||
202 | /** | 202 | /** |
203 | * This may or may not stick around, given an index, this returns a | 203 | * This may or may not stick around, given an index, this returns a |
204 | * codepoint, however there isn't necesarilly a 1:1 ratio between | 204 | * codepoint, however there isn't necesarilly a 1:1 ratio between |
205 | * indexes and code points. | 205 | * indexes and code points. |
206 | */ | 206 | */ |
207 | UtfChar get( int iIndex ) const; | 207 | UtfChar get( int iIndex ) const; |
208 | 208 | ||
209 | /** | 209 | /** |
210 | * This is what to use if you want to iterate through a section of the | 210 | * This is what to use if you want to iterate through a section of the |
211 | * UtfString and you want to use a numerical index. In most cases it | 211 | * UtfString and you want to use a numerical index. In most cases it |
212 | * will be much easier to use an iterator, though. Given an index this | 212 | * will be much easier to use an iterator, though. Given an index this |
213 | * will return the codepoint at that position and increment iIndex an | 213 | * will return the codepoint at that position and increment iIndex an |
214 | * appropriate amount for it to point to the next code point. | 214 | * appropriate amount for it to point to the next code point. |
215 | */ | 215 | */ |
216 | UtfChar nextChar( int &iIndex ) const; | 216 | UtfChar nextChar( int &iIndex ) const; |
217 | 217 | ||
218 | bool operator==( const Bu::UtfString &rhs ) const; | 218 | bool operator==( const Bu::UtfString &rhs ) const; |
219 | UtfString &operator+=( const Bu::UtfString &rhs ); | 219 | UtfString &operator+=( const Bu::UtfString &rhs ); |
220 | UtfString &operator+=( const UtfChar &rhs ); | 220 | UtfString &operator+=( const UtfChar &rhs ); |
221 | 221 | ||
222 | private: | 222 | private: |
223 | void append16( uint16_t i ) { aData.append( i ); } | 223 | void append16( uint16_t i ) { aData.append( i ); } |
224 | 224 | ||
225 | void setUtf8( const Bu::String &sInput ); | 225 | void setUtf8( const Bu::String &sInput ); |
226 | void setUtf16( const Bu::String &sInput ); | 226 | void setUtf16( const Bu::String &sInput ); |
227 | void setUtf16be( const Bu::String &sInput ); | 227 | void setUtf16be( const Bu::String &sInput ); |
228 | void setUtf16le( const Bu::String &sInput ); | 228 | void setUtf16le( const Bu::String &sInput ); |
229 | void setUtf32( const Bu::String &sInput ); | 229 | void setUtf32( const Bu::String &sInput ); |
230 | void setUtf32be( const Bu::String &sInput ); | 230 | void setUtf32be( const Bu::String &sInput ); |
231 | void setUtf32le( const Bu::String &sInput ); | 231 | void setUtf32le( const Bu::String &sInput ); |
232 | 232 | ||
233 | void writeUtf8( Bu::Stream &sOut ) const; | 233 | void writeUtf8( Bu::Stream &sOut ) const; |
234 | void writeUtf16be( Bu::Stream &sOut ) const; | 234 | void writeUtf16be( Bu::Stream &sOut ) const; |
235 | void writeUtf16le( Bu::Stream &sOut ) const; | 235 | void writeUtf16le( Bu::Stream &sOut ) const; |
236 | void writeUtf32be( Bu::Stream &sOut ) const; | 236 | void writeUtf32be( Bu::Stream &sOut ) const; |
237 | void writeUtf32le( Bu::Stream &sOut ) const; | 237 | void writeUtf32le( Bu::Stream &sOut ) const; |
238 | 238 | ||
239 | private: | 239 | private: |
240 | Bu::Array<uint16_t> aData; | 240 | Bu::Array<uint16_t> aData; |
241 | int iRawLen; | 241 | int iRawLen; |
242 | int iCharLen; | 242 | int iCharLen; |
243 | }; | 243 | }; |
244 | 244 | ||
//
// Hash support
//
// Primary templates for the hashing hooks, redeclared here so that the
// UtfString specializations below can be declared.
// NOTE(review): presumably these match the templates used by the library's
// hash table; confirm against that header.
//
template<typename T>
uint32_t __calcHashCode( const T &k );

template<typename T>
bool __cmpHashKeys( const T &a, const T &b );

// Explicit specializations for UtfString keys.  They are only declared
// here; no definition appears in the visible part of this header.
template<> uint32_t __calcHashCode<UtfString>( const UtfString &k );
template<> bool __cmpHashKeys<UtfString>(
    const UtfString &a, const UtfString &b );
257 | }; | 257 | }; |
258 | 258 | ||
259 | #endif | 259 | #endif |