diff options
Diffstat (limited to 'src/unstable/utfstring.cpp')
-rw-r--r-- | src/unstable/utfstring.cpp | 842 |
1 files changed, 421 insertions, 421 deletions
diff --git a/src/unstable/utfstring.cpp b/src/unstable/utfstring.cpp index 9fe2d02..421d5fb 100644 --- a/src/unstable/utfstring.cpp +++ b/src/unstable/utfstring.cpp | |||
@@ -20,12 +20,12 @@ Bu::UtfString::UtfString() | |||
20 | 20 | ||
21 | Bu::UtfString::UtfString( const Bu::String &sInput, Encoding eEnc ) | 21 | Bu::UtfString::UtfString( const Bu::String &sInput, Encoding eEnc ) |
22 | { | 22 | { |
23 | set( sInput, eEnc ); | 23 | set( sInput, eEnc ); |
24 | } | 24 | } |
25 | 25 | ||
26 | Bu::UtfString::UtfString( const char *sInput, Encoding eEnc ) | 26 | Bu::UtfString::UtfString( const char *sInput, Encoding eEnc ) |
27 | { | 27 | { |
28 | set( sInput, eEnc ); | 28 | set( sInput, eEnc ); |
29 | } | 29 | } |
30 | 30 | ||
31 | Bu::UtfString::~UtfString() | 31 | Bu::UtfString::~UtfString() |
@@ -34,340 +34,340 @@ Bu::UtfString::~UtfString() | |||
34 | 34 | ||
35 | Bu::UtfString::iterator Bu::UtfString::begin() | 35 | Bu::UtfString::iterator Bu::UtfString::begin() |
36 | { | 36 | { |
37 | return Bu::UtfString::iterator( this, 0 ); | 37 | return Bu::UtfString::iterator( this, 0 ); |
38 | } | 38 | } |
39 | 39 | ||
40 | Bu::UtfString::const_iterator Bu::UtfString::begin() const | 40 | Bu::UtfString::const_iterator Bu::UtfString::begin() const |
41 | { | 41 | { |
42 | return Bu::UtfString::const_iterator( this, 0 ); | 42 | return Bu::UtfString::const_iterator( this, 0 ); |
43 | } | 43 | } |
44 | 44 | ||
45 | void Bu::UtfString::set( const Bu::String &sInput, Encoding eEnc ) | 45 | void Bu::UtfString::set( const Bu::String &sInput, Encoding eEnc ) |
46 | { | 46 | { |
47 | switch( eEnc ) | 47 | switch( eEnc ) |
48 | { | 48 | { |
49 | case Utf8: | 49 | case Utf8: |
50 | setUtf8( sInput ); | 50 | setUtf8( sInput ); |
51 | break; | 51 | break; |
52 | 52 | ||
53 | case Utf16: | 53 | case Utf16: |
54 | setUtf16( sInput ); | 54 | setUtf16( sInput ); |
55 | break; | 55 | break; |
56 | 56 | ||
57 | case Utf16be: | 57 | case Utf16be: |
58 | setUtf16be( sInput ); | 58 | setUtf16be( sInput ); |
59 | break; | 59 | break; |
60 | 60 | ||
61 | case Utf16le: | 61 | case Utf16le: |
62 | setUtf16le( sInput ); | 62 | setUtf16le( sInput ); |
63 | break; | 63 | break; |
64 | 64 | ||
65 | case Utf32: | 65 | case Utf32: |
66 | setUtf32( sInput ); | 66 | setUtf32( sInput ); |
67 | break; | 67 | break; |
68 | 68 | ||
69 | case Utf32be: | 69 | case Utf32be: |
70 | setUtf32be( sInput ); | 70 | setUtf32be( sInput ); |
71 | break; | 71 | break; |
72 | 72 | ||
73 | case Utf32le: | 73 | case Utf32le: |
74 | setUtf32le( sInput ); | 74 | setUtf32le( sInput ); |
75 | break; | 75 | break; |
76 | 76 | ||
77 | case Ucs2: | 77 | case Ucs2: |
78 | throw Bu::ExceptionBase("Ucs2 not supported yet."); | 78 | throw Bu::ExceptionBase("Ucs2 not supported yet."); |
79 | break; | 79 | break; |
80 | 80 | ||
81 | case Ucs4: | 81 | case Ucs4: |
82 | throw Bu::ExceptionBase("Ucs4 not supported yet."); | 82 | throw Bu::ExceptionBase("Ucs4 not supported yet."); |
83 | break; | 83 | break; |
84 | 84 | ||
85 | case GuessEncoding: | 85 | case GuessEncoding: |
86 | throw Bu::ExceptionBase("Guessing mode not supported yet."); | 86 | throw Bu::ExceptionBase("Guessing mode not supported yet."); |
87 | break; | 87 | break; |
88 | } | 88 | } |
89 | } | 89 | } |
90 | 90 | ||
91 | void Bu::UtfString::append( UtfChar ch ) | 91 | void Bu::UtfString::append( UtfChar ch ) |
92 | { | 92 | { |
93 | if( ch >= 0x10000 ) | 93 | if( ch >= 0x10000 ) |
94 | { | 94 | { |
95 | ch -= 0x10000; | 95 | ch -= 0x10000; |
96 | append16( ((ch>>10)&0x3FF)| 0xD800u ); | 96 | append16( ((ch>>10)&0x3FF)| 0xD800u ); |
97 | append16( (ch&0x3FF)| 0xDC00u ); | 97 | append16( (ch&0x3FF)| 0xDC00u ); |
98 | } | 98 | } |
99 | else | 99 | else |
100 | { | 100 | { |
101 | append16( (uint16_t)(ch) ); | 101 | append16( (uint16_t)(ch) ); |
102 | } | 102 | } |
103 | } | 103 | } |
104 | 104 | ||
105 | void Bu::UtfString::append( const UtfString &rSrc ) | 105 | void Bu::UtfString::append( const UtfString &rSrc ) |
106 | { | 106 | { |
107 | aData.append( rSrc.aData ); | 107 | aData.append( rSrc.aData ); |
108 | iRawLen += rSrc.iRawLen; | 108 | iRawLen += rSrc.iRawLen; |
109 | iCharLen += rSrc.iCharLen; | 109 | iCharLen += rSrc.iCharLen; |
110 | } | 110 | } |
111 | 111 | ||
112 | void Bu::UtfString::setUtf8( const Bu::String &sInput ) | 112 | void Bu::UtfString::setUtf8( const Bu::String &sInput ) |
113 | { | 113 | { |
114 | static uint8_t lmask[8] = { | 114 | static uint8_t lmask[8] = { |
115 | 0x00, | 115 | 0x00, |
116 | 0x01, | 116 | 0x01, |
117 | 0x03, | 117 | 0x03, |
118 | 0x07, | 118 | 0x07, |
119 | 0x0f, | 119 | 0x0f, |
120 | 0x1f, | 120 | 0x1f, |
121 | 0x3f, | 121 | 0x3f, |
122 | 0x7f | 122 | 0x7f |
123 | }; | 123 | }; |
124 | for( Bu::String::const_iterator i = sInput.begin(); i; i++ ) | 124 | for( Bu::String::const_iterator i = sInput.begin(); i; i++ ) |
125 | { | 125 | { |
126 | if( ((int)(uint8_t)*i)&0x80 ) | 126 | if( ((int)(uint8_t)*i)&0x80 ) |
127 | { | 127 | { |
128 | int iBytes = 1; | 128 | int iBytes = 1; |
129 | for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { } | 129 | for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { } |
130 | Bu::UtfChar uPt = ((*i) & lmask[7-iBytes])<<(6*(iBytes-1)); | 130 | Bu::UtfChar uPt = ((*i) & lmask[7-iBytes])<<(6*(iBytes-1)); |
131 | for( iBytes--; iBytes >= 1; iBytes-- ) | 131 | for( iBytes--; iBytes >= 1; iBytes-- ) |
132 | { | 132 | { |
133 | i++; | 133 | i++; |
134 | uPt |= ((*i)&lmask[6])<<(6*(iBytes-1)); | 134 | uPt |= ((*i)&lmask[6])<<(6*(iBytes-1)); |
135 | } | 135 | } |
136 | append( uPt ); | 136 | append( uPt ); |
137 | } | 137 | } |
138 | else | 138 | else |
139 | { | 139 | { |
140 | append( (Bu::UtfChar)(*i) ); | 140 | append( (Bu::UtfChar)(*i) ); |
141 | } | 141 | } |
142 | } | 142 | } |
143 | } | 143 | } |
144 | 144 | ||
145 | void Bu::UtfString::setUtf16( const Bu::String &sInput ) | 145 | void Bu::UtfString::setUtf16( const Bu::String &sInput ) |
146 | { | 146 | { |
147 | // Bu::String::const_iterator i = sInput.begin(); | 147 | // Bu::String::const_iterator i = sInput.begin(); |
148 | if( (uint8_t)*sInput.begin() == 0xFF && | 148 | if( (uint8_t)*sInput.begin() == 0xFF && |
149 | (uint8_t)*(sInput.begin()+1) == 0xFE ) | 149 | (uint8_t)*(sInput.begin()+1) == 0xFE ) |
150 | { | 150 | { |
151 | setUtf16le( sInput ); | 151 | setUtf16le( sInput ); |
152 | return; | 152 | return; |
153 | } | 153 | } |
154 | setUtf16be( sInput ); | 154 | setUtf16be( sInput ); |
155 | } | 155 | } |
156 | 156 | ||
157 | void Bu::UtfString::setUtf16be( const Bu::String &sInput ) | 157 | void Bu::UtfString::setUtf16be( const Bu::String &sInput ) |
158 | { | 158 | { |
159 | Bu::String::const_iterator i = sInput.begin(); | 159 | Bu::String::const_iterator i = sInput.begin(); |
160 | if( (uint8_t)*sInput.begin() == 0xFE && | 160 | if( (uint8_t)*sInput.begin() == 0xFE && |
161 | (uint8_t)*(sInput.begin()+1) == 0xFF ) | 161 | (uint8_t)*(sInput.begin()+1) == 0xFF ) |
162 | 162 | ||
163 | { | 163 | { |
164 | i += 2; | 164 | i += 2; |
165 | sio << "Verified big endian." << sio.nl; | 165 | sio << "Verified big endian." << sio.nl; |
166 | } | 166 | } |
167 | else | 167 | else |
168 | { | 168 | { |
169 | sio << "Assuming big endian." << sio.nl; | 169 | sio << "Assuming big endian." << sio.nl; |
170 | } | 170 | } |
171 | uint16_t hi, lo; | 171 | uint16_t hi, lo; |
172 | for( ; i; i++ ) | 172 | for( ; i; i++ ) |
173 | { | 173 | { |
174 | hi = (((uint8_t)*i)<<8) | ((uint8_t)*(++i)); | 174 | hi = (((uint8_t)*i)<<8) | ((uint8_t)*(++i)); |
175 | append16( hi ); | 175 | append16( hi ); |
176 | if( (hi&0xD800u) == 0xD800u ) | 176 | if( (hi&0xD800u) == 0xD800u ) |
177 | { | 177 | { |
178 | lo = (((uint8_t)*(++i))<<8) | ((uint8_t)*(++i)); | 178 | lo = (((uint8_t)*(++i))<<8) | ((uint8_t)*(++i)); |
179 | append16( lo ); | 179 | append16( lo ); |
180 | } | 180 | } |
181 | } | 181 | } |
182 | } | 182 | } |
183 | 183 | ||
184 | void Bu::UtfString::setUtf16le( const Bu::String &sInput ) | 184 | void Bu::UtfString::setUtf16le( const Bu::String &sInput ) |
185 | { | 185 | { |
186 | Bu::String::const_iterator i = sInput.begin(); | 186 | Bu::String::const_iterator i = sInput.begin(); |
187 | if( (uint8_t)*sInput.begin() == 0xFF && | 187 | if( (uint8_t)*sInput.begin() == 0xFF && |
188 | (uint8_t)*(sInput.begin()+1) == 0xFE ) | 188 | (uint8_t)*(sInput.begin()+1) == 0xFE ) |
189 | { | 189 | { |
190 | i += 2; | 190 | i += 2; |
191 | sio << "Verified little endian." << sio.nl; | 191 | sio << "Verified little endian." << sio.nl; |
192 | } | 192 | } |
193 | else | 193 | else |
194 | { | 194 | { |
195 | sio << "Assuming little endian." << sio.nl; | 195 | sio << "Assuming little endian." << sio.nl; |
196 | } | 196 | } |
197 | uint16_t hi, lo; | 197 | uint16_t hi, lo; |
198 | for( ; i; i++ ) | 198 | for( ; i; i++ ) |
199 | { | 199 | { |
200 | hi = (((uint8_t)*i)) | ((uint8_t)*(++i)<<8); | 200 | hi = (((uint8_t)*i)) | ((uint8_t)*(++i)<<8); |
201 | append16( hi ); | 201 | append16( hi ); |
202 | if( (hi&0xD800u) == 0xD800u ) | 202 | if( (hi&0xD800u) == 0xD800u ) |
203 | { | 203 | { |
204 | lo = (((uint8_t)*(++i))) | ((uint8_t)*(++i)<<8); | 204 | lo = (((uint8_t)*(++i))) | ((uint8_t)*(++i)<<8); |
205 | append16( lo ); | 205 | append16( lo ); |
206 | } | 206 | } |
207 | } | 207 | } |
208 | } | 208 | } |
209 | 209 | ||
210 | void Bu::UtfString::setUtf32( const Bu::String &sInput ) | 210 | void Bu::UtfString::setUtf32( const Bu::String &sInput ) |
211 | { | 211 | { |
212 | Bu::String::const_iterator i = sInput.begin(); | 212 | Bu::String::const_iterator i = sInput.begin(); |
213 | if( (uint8_t)*i == 0x00 && | 213 | if( (uint8_t)*i == 0x00 && |
214 | (uint8_t)*(++i) == 0x00 && | 214 | (uint8_t)*(++i) == 0x00 && |
215 | (uint8_t)*(++i) == 0xFF && | 215 | (uint8_t)*(++i) == 0xFF && |
216 | (uint8_t)*(++i) == 0xFE ) | 216 | (uint8_t)*(++i) == 0xFE ) |
217 | { | 217 | { |
218 | setUtf32le( sInput ); | 218 | setUtf32le( sInput ); |
219 | return; | 219 | return; |
220 | } | 220 | } |
221 | setUtf32be( sInput ); | 221 | setUtf32be( sInput ); |
222 | } | 222 | } |
223 | 223 | ||
224 | void Bu::UtfString::setUtf32be( const Bu::String &sInput ) | 224 | void Bu::UtfString::setUtf32be( const Bu::String &sInput ) |
225 | { | 225 | { |
226 | Bu::String::const_iterator i = sInput.begin(); | 226 | Bu::String::const_iterator i = sInput.begin(); |
227 | if( (uint8_t)*i == 0x00 && | 227 | if( (uint8_t)*i == 0x00 && |
228 | (uint8_t)*(++i) == 0x00 && | 228 | (uint8_t)*(++i) == 0x00 && |
229 | (uint8_t)*(++i) == 0xFE && | 229 | (uint8_t)*(++i) == 0xFE && |
230 | (uint8_t)*(++i) == 0xFF ) | 230 | (uint8_t)*(++i) == 0xFF ) |
231 | { | 231 | { |
232 | i++; | 232 | i++; |
233 | sio << "Verified big endian." << sio.nl; | 233 | sio << "Verified big endian." << sio.nl; |
234 | } | 234 | } |
235 | else | 235 | else |
236 | { | 236 | { |
237 | i = sInput.begin(); | 237 | i = sInput.begin(); |
238 | sio << "Assuming big endian." << sio.nl; | 238 | sio << "Assuming big endian." << sio.nl; |
239 | } | 239 | } |
240 | for( ; i; i++ ) | 240 | for( ; i; i++ ) |
241 | { | 241 | { |
242 | append( (((uint8_t)*i)<<24) | | 242 | append( (((uint8_t)*i)<<24) | |
243 | (((uint8_t)*(++i))<<16) | | 243 | (((uint8_t)*(++i))<<16) | |
244 | (((uint8_t)*(++i))<<8) | | 244 | (((uint8_t)*(++i))<<8) | |
245 | ((uint8_t)*(++i)) | 245 | ((uint8_t)*(++i)) |
246 | ); | 246 | ); |
247 | } | 247 | } |
248 | } | 248 | } |
249 | 249 | ||
250 | void Bu::UtfString::setUtf32le( const Bu::String &sInput ) | 250 | void Bu::UtfString::setUtf32le( const Bu::String &sInput ) |
251 | { | 251 | { |
252 | Bu::String::const_iterator i = sInput.begin(); | 252 | Bu::String::const_iterator i = sInput.begin(); |
253 | if( (uint8_t)*i == 0x00 && | 253 | if( (uint8_t)*i == 0x00 && |
254 | (uint8_t)*(++i) == 0x00 && | 254 | (uint8_t)*(++i) == 0x00 && |
255 | (uint8_t)*(++i) == 0xFF && | 255 | (uint8_t)*(++i) == 0xFF && |
256 | (uint8_t)*(++i) == 0xFE ) | 256 | (uint8_t)*(++i) == 0xFE ) |
257 | { | 257 | { |
258 | i++; | 258 | i++; |
259 | sio << "Verified little endian." << sio.nl; | 259 | sio << "Verified little endian." << sio.nl; |
260 | } | 260 | } |
261 | else | 261 | else |
262 | { | 262 | { |
263 | i = sInput.begin(); | 263 | i = sInput.begin(); |
264 | sio << "Assuming little endian." << sio.nl; | 264 | sio << "Assuming little endian." << sio.nl; |
265 | } | 265 | } |
266 | for( ; i; i++ ) | 266 | for( ; i; i++ ) |
267 | { | 267 | { |
268 | append( ((uint8_t)*i) | | 268 | append( ((uint8_t)*i) | |
269 | (((uint8_t)*(++i))<<8) | | 269 | (((uint8_t)*(++i))<<8) | |
270 | (((uint8_t)*(++i))<<16) | | 270 | (((uint8_t)*(++i))<<16) | |
271 | (((uint8_t)*(++i))<<24) | 271 | (((uint8_t)*(++i))<<24) |
272 | ); | 272 | ); |
273 | } | 273 | } |
274 | } | 274 | } |
275 | 275 | ||
276 | void Bu::UtfString::write( Bu::Stream &sOut, Encoding eEnc ) const | 276 | void Bu::UtfString::write( Bu::Stream &sOut, Encoding eEnc ) const |
277 | { | 277 | { |
278 | switch( eEnc ) | 278 | switch( eEnc ) |
279 | { | 279 | { |
280 | case Utf8: | 280 | case Utf8: |
281 | writeUtf8( sOut ); | 281 | writeUtf8( sOut ); |
282 | break; | 282 | break; |
283 | 283 | ||
284 | case Utf16: | 284 | case Utf16: |
285 | // writeUtf16( sOut ); | 285 | // writeUtf16( sOut ); |
286 | // break; | 286 | // break; |
287 | 287 | ||
288 | case Utf16be: | 288 | case Utf16be: |
289 | writeUtf16be( sOut ); | 289 | writeUtf16be( sOut ); |
290 | break; | 290 | break; |
291 | 291 | ||
292 | case Utf16le: | 292 | case Utf16le: |
293 | writeUtf16le( sOut ); | 293 | writeUtf16le( sOut ); |
294 | break; | 294 | break; |
295 | 295 | ||
296 | case Utf32: | 296 | case Utf32: |
297 | // writeUtf32( sOut ); | 297 | // writeUtf32( sOut ); |
298 | // break; | 298 | // break; |
299 | 299 | ||
300 | case Utf32be: | 300 | case Utf32be: |
301 | writeUtf32be( sOut ); | 301 | writeUtf32be( sOut ); |
302 | break; | 302 | break; |
303 | 303 | ||
304 | case Utf32le: | 304 | case Utf32le: |
305 | writeUtf32le( sOut ); | 305 | writeUtf32le( sOut ); |
306 | break; | 306 | break; |
307 | 307 | ||
308 | case Ucs2: | 308 | case Ucs2: |
309 | throw Bu::ExceptionBase("Ucs2 not supported yet."); | 309 | throw Bu::ExceptionBase("Ucs2 not supported yet."); |
310 | break; | 310 | break; |
311 | 311 | ||
312 | case Ucs4: | 312 | case Ucs4: |
313 | throw Bu::ExceptionBase("Ucs4 not supported yet."); | 313 | throw Bu::ExceptionBase("Ucs4 not supported yet."); |
314 | break; | 314 | break; |
315 | 315 | ||
316 | case GuessEncoding: | 316 | case GuessEncoding: |
317 | throw Bu::ExceptionBase( | 317 | throw Bu::ExceptionBase( |
318 | "GuessEncoding is incompatible with encoding."); | 318 | "GuessEncoding is incompatible with encoding."); |
319 | break; | 319 | break; |
320 | 320 | ||
321 | } | 321 | } |
322 | } | 322 | } |
323 | 323 | ||
324 | void Bu::UtfString::writeUtf8( Bu::Stream &sOut ) const | 324 | void Bu::UtfString::writeUtf8( Bu::Stream &sOut ) const |
325 | { | 325 | { |
326 | int iPos = 0; | 326 | int iPos = 0; |
327 | while( iPos < aData.getSize() ) | 327 | while( iPos < aData.getSize() ) |
328 | { | 328 | { |
329 | uint8_t uByte; | 329 | uint8_t uByte; |
330 | Bu::UtfChar chr = nextChar( iPos ); | 330 | Bu::UtfChar chr = nextChar( iPos ); |
331 | if( chr >= 0x010000 ) | 331 | if( chr >= 0x010000 ) |
332 | { | 332 | { |
333 | // Four bytes | 333 | // Four bytes |
334 | // 111 111111 111111 111111 | 334 | // 111 111111 111111 111111 |
335 | uByte = (chr>>18)|0xF0; | 335 | uByte = (chr>>18)|0xF0; |
336 | sOut.write( &uByte, 1 ); | 336 | sOut.write( &uByte, 1 ); |
337 | uByte = ((chr>>12)&0x3F)|0x80; | 337 | uByte = ((chr>>12)&0x3F)|0x80; |
338 | sOut.write( &uByte, 1 ); | 338 | sOut.write( &uByte, 1 ); |
339 | uByte = ((chr>>6)&0x3F)|0x80; | 339 | uByte = ((chr>>6)&0x3F)|0x80; |
340 | sOut.write( &uByte, 1 ); | 340 | sOut.write( &uByte, 1 ); |
341 | uByte = (chr&0x3F)|0x80; | 341 | uByte = (chr&0x3F)|0x80; |
342 | sOut.write( &uByte, 1 ); | 342 | sOut.write( &uByte, 1 ); |
343 | } | 343 | } |
344 | else if( chr >= 0x800 ) | 344 | else if( chr >= 0x800 ) |
345 | { | 345 | { |
346 | // Three bytes | 346 | // Three bytes |
347 | // 1111 111111 111111 | 347 | // 1111 111111 111111 |
348 | uByte = (chr>>12)|0xE0; | 348 | uByte = (chr>>12)|0xE0; |
349 | sOut.write( &uByte, 1 ); | 349 | sOut.write( &uByte, 1 ); |
350 | uByte = ((chr>>6)&0x3F)|0x80; | 350 | uByte = ((chr>>6)&0x3F)|0x80; |
351 | sOut.write( &uByte, 1 ); | 351 | sOut.write( &uByte, 1 ); |
352 | uByte = (chr&0x3F)|0x80; | 352 | uByte = (chr&0x3F)|0x80; |
353 | sOut.write( &uByte, 1 ); | 353 | sOut.write( &uByte, 1 ); |
354 | } | 354 | } |
355 | else if( chr >= 0x80 ) | 355 | else if( chr >= 0x80 ) |
356 | { | 356 | { |
357 | // Two bytes | 357 | // Two bytes |
358 | // 11111 111111 | 358 | // 11111 111111 |
359 | uByte = (chr>>6)|0xC0; | 359 | uByte = (chr>>6)|0xC0; |
360 | sOut.write( &uByte, 1 ); | 360 | sOut.write( &uByte, 1 ); |
361 | uByte = (chr&0x3F)|0x80; | 361 | uByte = (chr&0x3F)|0x80; |
362 | sOut.write( &uByte, 1 ); | 362 | sOut.write( &uByte, 1 ); |
363 | } | 363 | } |
364 | else | 364 | else |
365 | { | 365 | { |
366 | // One byte | 366 | // One byte |
367 | uByte = chr; | 367 | uByte = chr; |
368 | sOut.write( &uByte, 1 ); | 368 | sOut.write( &uByte, 1 ); |
369 | } | 369 | } |
370 | } | 370 | } |
371 | } | 371 | } |
372 | /* | 372 | /* |
373 | void Bu::UtfString::writeUtf16( Bu::Stream &sOut ) | 373 | void Bu::UtfString::writeUtf16( Bu::Stream &sOut ) |
@@ -377,228 +377,228 @@ void Bu::UtfString::writeUtf16( Bu::Stream &sOut ) | |||
377 | void Bu::UtfString::writeUtf16be( Bu::Stream &sOut ) const | 377 | void Bu::UtfString::writeUtf16be( Bu::Stream &sOut ) const |
378 | { | 378 | { |
379 | #if BYTE_ORDER == BIG_ENDIAN | 379 | #if BYTE_ORDER == BIG_ENDIAN |
380 | uint16_t iTmp = 0xFEFF; // Byte Order Marker | 380 | uint16_t iTmp = 0xFEFF; // Byte Order Marker |
381 | sOut.write( &iTmp, 2 ); | 381 | sOut.write( &iTmp, 2 ); |
382 | for( Array<uint16_t>::const_iterator i = aData.begin(); i; i++ ) | 382 | for( Array<uint16_t>::const_iterator i = aData.begin(); i; i++ ) |
383 | { | 383 | { |
384 | iTmp = *i; | 384 | iTmp = *i; |
385 | sOut.write( &iTmp, 2 ); | 385 | sOut.write( &iTmp, 2 ); |
386 | } | 386 | } |
387 | #else | 387 | #else |
388 | uint16_t iTmp = 0xFEFF; // Byte Order Marker | 388 | uint16_t iTmp = 0xFEFF; // Byte Order Marker |
389 | iTmp = (iTmp>>8) | (iTmp<<8); | 389 | iTmp = (iTmp>>8) | (iTmp<<8); |
390 | sOut.write( &iTmp, 2 ); | 390 | sOut.write( &iTmp, 2 ); |
391 | for( Array<uint16_t>::const_iterator i = aData.begin(); i; i++ ) | 391 | for( Array<uint16_t>::const_iterator i = aData.begin(); i; i++ ) |
392 | { | 392 | { |
393 | iTmp = *i; | 393 | iTmp = *i; |
394 | iTmp = (iTmp>>8) | (iTmp<<8); | 394 | iTmp = (iTmp>>8) | (iTmp<<8); |
395 | sOut.write( &iTmp, 2 ); | 395 | sOut.write( &iTmp, 2 ); |
396 | } | 396 | } |
397 | #endif | 397 | #endif |
398 | } | 398 | } |
399 | 399 | ||
400 | void Bu::UtfString::writeUtf16le( Bu::Stream &sOut ) const | 400 | void Bu::UtfString::writeUtf16le( Bu::Stream &sOut ) const |
401 | { | 401 | { |
402 | #if BYTE_ORDER == LITTLE_ENDIAN | 402 | #if BYTE_ORDER == LITTLE_ENDIAN |
403 | uint16_t iTmp = 0xFEFF; // Byte Order Marker | 403 | uint16_t iTmp = 0xFEFF; // Byte Order Marker |
404 | sOut.write( &iTmp, 2 ); | 404 | sOut.write( &iTmp, 2 ); |
405 | for( Array<uint16_t>::const_iterator i = aData.begin(); i; i++ ) | 405 | for( Array<uint16_t>::const_iterator i = aData.begin(); i; i++ ) |
406 | { | 406 | { |
407 | iTmp = *i; | 407 | iTmp = *i; |
408 | sOut.write( &iTmp, 2 ); | 408 | sOut.write( &iTmp, 2 ); |
409 | } | 409 | } |
410 | #else | 410 | #else |
411 | uint16_t iTmp = 0xFEFF; // Byte Order Marker | 411 | uint16_t iTmp = 0xFEFF; // Byte Order Marker |
412 | iTmp = (iTmp>>8) | (iTmp<<8); | 412 | iTmp = (iTmp>>8) | (iTmp<<8); |
413 | sOut.write( &iTmp, 2 ); | 413 | sOut.write( &iTmp, 2 ); |
414 | for( Array<uint16_t>::const_iterator i = aData.begin(); i; i++ ) | 414 | for( Array<uint16_t>::const_iterator i = aData.begin(); i; i++ ) |
415 | { | 415 | { |
416 | iTmp = *i; | 416 | iTmp = *i; |
417 | iTmp = (iTmp>>8) | (iTmp<<8); | 417 | iTmp = (iTmp>>8) | (iTmp<<8); |
418 | sOut.write( &iTmp, 2 ); | 418 | sOut.write( &iTmp, 2 ); |
419 | } | 419 | } |
420 | #endif | 420 | #endif |
421 | } | 421 | } |
422 | 422 | ||
423 | void Bu::UtfString::writeUtf32be( Bu::Stream &sOut ) const | 423 | void Bu::UtfString::writeUtf32be( Bu::Stream &sOut ) const |
424 | { | 424 | { |
425 | #if BYTE_ORDER == BIG_ENDIAN | 425 | #if BYTE_ORDER == BIG_ENDIAN |
426 | uint32_t iTmp = 0xFEFF; // Byte Order Marker | 426 | uint32_t iTmp = 0xFEFF; // Byte Order Marker |
427 | sOut.write( &iTmp, 4 ); | 427 | sOut.write( &iTmp, 4 ); |
428 | int i = 0; | 428 | int i = 0; |
429 | while( i < aData.getSize() ) | 429 | while( i < aData.getSize() ) |
430 | { | 430 | { |
431 | iTmp = nextChar( i ); | 431 | iTmp = nextChar( i ); |
432 | sOut.write( &iTmp, 4 ); | 432 | sOut.write( &iTmp, 4 ); |
433 | } | 433 | } |
434 | #else | 434 | #else |
435 | uint32_t iTmp = 0xFEFF; // Byte Order Marker | 435 | uint32_t iTmp = 0xFEFF; // Byte Order Marker |
436 | iTmp = (iTmp>>24)|(iTmp<<24)|((iTmp&0xff0000)>>8)|((iTmp&0xff00)<<8); | 436 | iTmp = (iTmp>>24)|(iTmp<<24)|((iTmp&0xff0000)>>8)|((iTmp&0xff00)<<8); |
437 | sOut.write( &iTmp, 4 ); | 437 | sOut.write( &iTmp, 4 ); |
438 | int i = 0; | 438 | int i = 0; |
439 | while( i < aData.getSize() ) | 439 | while( i < aData.getSize() ) |
440 | { | 440 | { |
441 | iTmp = nextChar( i ); | 441 | iTmp = nextChar( i ); |
442 | iTmp = (iTmp>>24)|(iTmp<<24)|((iTmp&0xff0000)>>8)|((iTmp&0xff00)<<8); | 442 | iTmp = (iTmp>>24)|(iTmp<<24)|((iTmp&0xff0000)>>8)|((iTmp&0xff00)<<8); |
443 | sOut.write( &iTmp, 4 ); | 443 | sOut.write( &iTmp, 4 ); |
444 | } | 444 | } |
445 | #endif | 445 | #endif |
446 | } | 446 | } |
447 | 447 | ||
448 | void Bu::UtfString::writeUtf32le( Bu::Stream &sOut ) const | 448 | void Bu::UtfString::writeUtf32le( Bu::Stream &sOut ) const |
449 | { | 449 | { |
450 | #if BYTE_ORDER == LITTLE_ENDIAN | 450 | #if BYTE_ORDER == LITTLE_ENDIAN |
451 | uint32_t iTmp = 0xFEFF; // Byte Order Marker | 451 | uint32_t iTmp = 0xFEFF; // Byte Order Marker |
452 | sOut.write( &iTmp, 4 ); | 452 | sOut.write( &iTmp, 4 ); |
453 | int i = 0; | 453 | int i = 0; |
454 | while( i < aData.getSize() ) | 454 | while( i < aData.getSize() ) |
455 | { | 455 | { |
456 | iTmp = nextChar( i ); | 456 | iTmp = nextChar( i ); |
457 | sOut.write( &iTmp, 4 ); | 457 | sOut.write( &iTmp, 4 ); |
458 | } | 458 | } |
459 | #else | 459 | #else |
460 | uint32_t iTmp = 0xFEFF; // Byte Order Marker | 460 | uint32_t iTmp = 0xFEFF; // Byte Order Marker |
461 | iTmp = (iTmp>>24)|(iTmp<<24)|((iTmp&0xff0000)>>8)|((iTmp&0xff00)<<8); | 461 | iTmp = (iTmp>>24)|(iTmp<<24)|((iTmp&0xff0000)>>8)|((iTmp&0xff00)<<8); |
462 | sOut.write( &iTmp, 4 ); | 462 | sOut.write( &iTmp, 4 ); |
463 | int i = 0; | 463 | int i = 0; |
464 | while( i < aData.getSize() ) | 464 | while( i < aData.getSize() ) |
465 | { | 465 | { |
466 | iTmp = nextChar( i ); | 466 | iTmp = nextChar( i ); |
467 | iTmp = (iTmp>>24)|(iTmp<<24)|((iTmp&0xff0000)>>8)|((iTmp&0xff00)<<8); | 467 | iTmp = (iTmp>>24)|(iTmp<<24)|((iTmp&0xff0000)>>8)|((iTmp&0xff00)<<8); |
468 | sOut.write( &iTmp, 4 ); | 468 | sOut.write( &iTmp, 4 ); |
469 | } | 469 | } |
470 | #endif | 470 | #endif |
471 | } | 471 | } |
472 | 472 | ||
473 | Bu::UtfChar Bu::UtfString::get( int iIndex ) const | 473 | Bu::UtfChar Bu::UtfString::get( int iIndex ) const |
474 | { | 474 | { |
475 | return nextChar( iIndex ); | 475 | return nextChar( iIndex ); |
476 | } | 476 | } |
477 | 477 | ||
478 | Bu::UtfChar Bu::UtfString::nextChar( int &iIndex ) const | 478 | Bu::UtfChar Bu::UtfString::nextChar( int &iIndex ) const |
479 | { | 479 | { |
480 | Bu::UtfChar i = aData[iIndex++]; | 480 | Bu::UtfChar i = aData[iIndex++]; |
481 | switch( i&0xFC00 ) | 481 | switch( i&0xFC00 ) |
482 | { | 482 | { |
483 | case 0xD800: | 483 | case 0xD800: |
484 | return (((i&0x3FF)<<10) | ((aData[iIndex++]&0x3FF)))+0x10000; | 484 | return (((i&0x3FF)<<10) | ((aData[iIndex++]&0x3FF)))+0x10000; |
485 | 485 | ||
486 | case 0xDC00: | 486 | case 0xDC00: |
487 | return (((aData[iIndex-2]&0x3FF)<<10) | ((i&0x3FF)))+0x10000; | 487 | return (((aData[iIndex-2]&0x3FF)<<10) | ((i&0x3FF)))+0x10000; |
488 | 488 | ||
489 | default: | 489 | default: |
490 | return i; | 490 | return i; |
491 | } | 491 | } |
492 | } | 492 | } |
493 | 493 | ||
494 | bool Bu::UtfString::operator==( const Bu::UtfString &rhs ) const | 494 | bool Bu::UtfString::operator==( const Bu::UtfString &rhs ) const |
495 | { | 495 | { |
496 | return aData == rhs.aData; | 496 | return aData == rhs.aData; |
497 | } | 497 | } |
498 | 498 | ||
499 | Bu::UtfString &Bu::UtfString::operator+=( const Bu::UtfString &rhs ) | 499 | Bu::UtfString &Bu::UtfString::operator+=( const Bu::UtfString &rhs ) |
500 | { | 500 | { |
501 | append( rhs ); | 501 | append( rhs ); |
502 | return *this; | 502 | return *this; |
503 | } | 503 | } |
504 | 504 | ||
505 | Bu::UtfString &Bu::UtfString::operator+=( const UtfChar &rhs ) | 505 | Bu::UtfString &Bu::UtfString::operator+=( const UtfChar &rhs ) |
506 | { | 506 | { |
507 | append( rhs ); | 507 | append( rhs ); |
508 | return *this; | 508 | return *this; |
509 | } | 509 | } |
510 | 510 | ||
511 | Bu::String Bu::UtfString::get( Encoding eEnc ) const | 511 | Bu::String Bu::UtfString::get( Encoding eEnc ) const |
512 | { | 512 | { |
513 | Bu::MemBuf mb; | 513 | Bu::MemBuf mb; |
514 | write( mb, eEnc ); | 514 | write( mb, eEnc ); |
515 | return mb.getString(); | 515 | return mb.getString(); |
516 | } | 516 | } |
517 | 517 | ||
518 | void Bu::UtfString::debug() const | 518 | void Bu::UtfString::debug() const |
519 | { | 519 | { |
520 | sio << "Raw Utf16: "; | 520 | sio << "Raw Utf16: "; |
521 | for( int i = 0; i < aData.getSize(); i++ ) | 521 | for( int i = 0; i < aData.getSize(); i++ ) |
522 | { | 522 | { |
523 | if( i > 0 ) | 523 | if( i > 0 ) |
524 | sio << ", "; | 524 | sio << ", "; |
525 | sio << "0x" << Fmt::hex() << aData[i]; | 525 | sio << "0x" << Fmt::hex() << aData[i]; |
526 | } | 526 | } |
527 | sio << sio.nl; | 527 | sio << sio.nl; |
528 | sio << "Code Points: "; | 528 | sio << "Code Points: "; |
529 | for( int i = 0; i < aData.getSize(); i++ ) | 529 | for( int i = 0; i < aData.getSize(); i++ ) |
530 | { | 530 | { |
531 | if( i > 0 ) | 531 | if( i > 0 ) |
532 | sio << ", "; | 532 | sio << ", "; |
533 | sio << "0x" << Fmt::hex() << nextChar( i ); | 533 | sio << "0x" << Fmt::hex() << nextChar( i ); |
534 | } | 534 | } |
535 | sio << sio.nl; | 535 | sio << sio.nl; |
536 | } | 536 | } |
537 | /* | 537 | /* |
538 | void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 ) | 538 | void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 ) |
539 | { | 539 | { |
540 | static uint8_t lmask[8] = { | 540 | static uint8_t lmask[8] = { |
541 | 0x00, | 541 | 0x00, |
542 | 0x01, | 542 | 0x01, |
543 | 0x03, | 543 | 0x03, |
544 | 0x07, | 544 | 0x07, |
545 | 0x0f, | 545 | 0x0f, |
546 | 0x1f, | 546 | 0x1f, |
547 | 0x3f, | 547 | 0x3f, |
548 | 0x7f | 548 | 0x7f |
549 | }; | 549 | }; |
550 | for( Bu::String::const_iterator i = sUtf8.begin(); i; i++ ) | 550 | for( Bu::String::const_iterator i = sUtf8.begin(); i; i++ ) |
551 | { | 551 | { |
552 | if( i != sUtf8.begin() ) | 552 | if( i != sUtf8.begin() ) |
553 | sio << ", "; | 553 | sio << ", "; |
554 | if( ((int)(uint8_t)*i)&0x80 ) | 554 | if( ((int)(uint8_t)*i)&0x80 ) |
555 | { | 555 | { |
556 | // sio << "Flag byte: " << Bu::Fmt().radix(2).width(8).fill('0') | 556 | // sio << "Flag byte: " << Bu::Fmt().radix(2).width(8).fill('0') |
557 | // << (int)(uint8_t)*i << sio.nl; | 557 | // << (int)(uint8_t)*i << sio.nl; |
558 | int iBytes = 1; | 558 | int iBytes = 1; |
559 | for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { } | 559 | for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { } |
560 | // sio << "iBytes = " << iBytes << sio.nl; | 560 | // sio << "iBytes = " << iBytes << sio.nl; |
561 | Bu::UtfChar uPt = ((*i) & lmask[7-iBytes])<<(6*(iBytes-1)); | 561 | Bu::UtfChar uPt = ((*i) & lmask[7-iBytes])<<(6*(iBytes-1)); |
562 | // sio << "mask: " << Bu::Fmt().radix(2).width(8).fill('0') | 562 | // sio << "mask: " << Bu::Fmt().radix(2).width(8).fill('0') |
563 | // << (int)lmask[7-iBytes] << sio.nl; | 563 | // << (int)lmask[7-iBytes] << sio.nl; |
564 | for( iBytes--; iBytes >= 1; iBytes-- ) | 564 | for( iBytes--; iBytes >= 1; iBytes-- ) |
565 | { | 565 | { |
566 | // sio << "iBytes = " << iBytes << ", shift = " << (6*(iBytes-1)) | 566 | // sio << "iBytes = " << iBytes << ", shift = " << (6*(iBytes-1)) |
567 | // << sio.nl; | 567 | // << sio.nl; |
568 | // sio << "next: " << Bu::Fmt().radix(2).width(8).fill('0') | 568 | // sio << "next: " << Bu::Fmt().radix(2).width(8).fill('0') |
569 | // << (int)(uint8_t)*i << sio.nl | 569 | // << (int)(uint8_t)*i << sio.nl |
570 | // << "mask: " << Bu::Fmt().radix(2).width(8).fill('0') | 570 | // << "mask: " << Bu::Fmt().radix(2).width(8).fill('0') |
571 | // << (int)lmask[6] << sio.nl; | 571 | // << (int)lmask[6] << sio.nl; |
572 | i++; | 572 | i++; |
573 | uPt |= ((*i)&lmask[6])<<(6*(iBytes-1)); | 573 | uPt |= ((*i)&lmask[6])<<(6*(iBytes-1)); |
574 | } | 574 | } |
575 | sio << uPt; | 575 | sio << uPt; |
576 | // sio << " (" << Bu::Fmt( 8, 2 ).fill('0') | 576 | // sio << " (" << Bu::Fmt( 8, 2 ).fill('0') |
577 | // << uPt << ")"; | 577 | // << uPt << ")"; |
578 | } | 578 | } |
579 | else | 579 | else |
580 | { | 580 | { |
581 | sio << (int)((uint8_t)*i); | 581 | sio << (int)((uint8_t)*i); |
582 | } | 582 | } |
583 | } | 583 | } |
584 | sio << sio.nl; | 584 | sio << sio.nl; |
585 | } | 585 | } |
586 | */ | 586 | */ |
587 | 587 | ||
588 | template<> uint32_t Bu::__calcHashCode<Bu::UtfString>( const Bu::UtfString &k ) | 588 | template<> uint32_t Bu::__calcHashCode<Bu::UtfString>( const Bu::UtfString &k ) |
589 | { | 589 | { |
590 | uint32_t uCode = 0; | 590 | uint32_t uCode = 0; |
591 | 591 | ||
592 | for( Bu::UtfString::const_iterator i = k.begin(); i; i++ ) | 592 | for( Bu::UtfString::const_iterator i = k.begin(); i; i++ ) |
593 | { | 593 | { |
594 | uCode = *i + (uCode<<6) + (uCode<<16) - uCode; | 594 | uCode = *i + (uCode<<6) + (uCode<<16) - uCode; |
595 | } | 595 | } |
596 | 596 | ||
597 | return uCode; | 597 | return uCode; |
598 | } | 598 | } |
599 | 599 | ||
600 | template<> bool Bu::__cmpHashKeys<Bu::UtfString>( | 600 | template<> bool Bu::__cmpHashKeys<Bu::UtfString>( |
601 | const Bu::UtfString &a, const Bu::UtfString &b ) | 601 | const Bu::UtfString &a, const Bu::UtfString &b ) |
602 | { | 602 | { |
603 | return a == b; | 603 | return a == b; |
604 | } | 604 | } |