diff options
Diffstat (limited to '')
-rw-r--r-- | src/utfstring.cpp | 240 |
1 files changed, 225 insertions, 15 deletions
diff --git a/src/utfstring.cpp b/src/utfstring.cpp index bb0a011..7c4ba19 100644 --- a/src/utfstring.cpp +++ b/src/utfstring.cpp | |||
@@ -8,9 +8,13 @@ | |||
8 | #include "bu/utfstring.h" | 8 | #include "bu/utfstring.h" |
9 | 9 | ||
10 | #include "bu/string.h" | 10 | #include "bu/string.h" |
11 | #include "bu/stream.h" | ||
11 | 12 | ||
12 | #include <endian.h> | 13 | #include <endian.h> |
13 | 14 | ||
15 | #include "bu/sio.h" | ||
16 | using Bu::sio; | ||
17 | |||
14 | Bu::UtfString::UtfString() | 18 | Bu::UtfString::UtfString() |
15 | { | 19 | { |
16 | } | 20 | } |
@@ -33,20 +37,35 @@ void Bu::UtfString::set( const Bu::String &sInput, Encoding eEnc ) | |||
33 | break; | 37 | break; |
34 | 38 | ||
35 | case Utf16: | 39 | case Utf16: |
36 | case Utf16be: | ||
37 | setUtf16( sInput ); | 40 | setUtf16( sInput ); |
38 | break; | 41 | break; |
39 | 42 | ||
43 | case Utf16be: | ||
44 | setUtf16be( sInput ); | ||
45 | break; | ||
46 | |||
40 | case Utf16le: | 47 | case Utf16le: |
41 | throw Bu::ExceptionBase("Utf16le not supported yet."); | 48 | setUtf16le( sInput ); |
42 | break; | 49 | break; |
43 | 50 | ||
44 | case Utf32: | 51 | case Utf32: |
45 | throw Bu::ExceptionBase("Utf32 not supported yet."); | 52 | setUtf32( sInput ); |
53 | break; | ||
54 | |||
55 | case Utf32be: | ||
56 | setUtf32be( sInput ); | ||
57 | break; | ||
58 | |||
59 | case Utf32le: | ||
60 | setUtf32le( sInput ); | ||
61 | break; | ||
62 | |||
63 | case Ucs2: | ||
64 | throw Bu::ExceptionBase("Ucs2 not supported yet."); | ||
46 | break; | 65 | break; |
47 | 66 | ||
48 | case Ucs16: | 67 | case Ucs4: |
49 | throw Bu::ExceptionBase("Ucs16 not supported yet."); | 68 | throw Bu::ExceptionBase("Ucs4 not supported yet."); |
50 | break; | 69 | break; |
51 | 70 | ||
52 | case GuessEncoding: | 71 | case GuessEncoding: |
@@ -104,8 +123,32 @@ void Bu::UtfString::setUtf8( const Bu::String &sInput ) | |||
104 | 123 | ||
105 | void Bu::UtfString::setUtf16( const Bu::String &sInput ) | 124 | void Bu::UtfString::setUtf16( const Bu::String &sInput ) |
106 | { | 125 | { |
126 | Bu::String::const_iterator i = sInput.begin(); | ||
127 | if( (uint8_t)*sInput.begin() == 0xFF && | ||
128 | (uint8_t)*(sInput.begin()+1) == 0xFE ) | ||
129 | { | ||
130 | setUtf16le( sInput ); | ||
131 | return; | ||
132 | } | ||
133 | setUtf16be( sInput ); | ||
134 | } | ||
135 | |||
136 | void Bu::UtfString::setUtf16be( const Bu::String &sInput ) | ||
137 | { | ||
138 | Bu::String::const_iterator i = sInput.begin(); | ||
139 | if( (uint8_t)*sInput.begin() == 0xFE && | ||
140 | (uint8_t)*(sInput.begin()+1) == 0xFF ) | ||
141 | |||
142 | { | ||
143 | i += 2; | ||
144 | sio << "Verified big endian." << sio.nl; | ||
145 | } | ||
146 | else | ||
147 | { | ||
148 | sio << "Assuming big endian." << sio.nl; | ||
149 | } | ||
107 | uint16_t hi, lo; | 150 | uint16_t hi, lo; |
108 | for( Bu::String::const_iterator i = sInput.begin(); i; i++ ) | 151 | for( ; i; i++ ) |
109 | { | 152 | { |
110 | hi = (((uint8_t)*i)<<8) | ((uint8_t)*(++i)); | 153 | hi = (((uint8_t)*i)<<8) | ((uint8_t)*(++i)); |
111 | append16( hi ); | 154 | append16( hi ); |
@@ -117,25 +160,192 @@ void Bu::UtfString::setUtf16( const Bu::String &sInput ) | |||
117 | } | 160 | } |
118 | } | 161 | } |
119 | 162 | ||
120 | #include "bu/sio.h" | 163 | void Bu::UtfString::setUtf16le( const Bu::String &sInput ) |
121 | using Bu::sio; | 164 | { |
165 | Bu::String::const_iterator i = sInput.begin(); | ||
166 | if( (uint8_t)*sInput.begin() == 0xFF && | ||
167 | (uint8_t)*(sInput.begin()+1) == 0xFE ) | ||
168 | { | ||
169 | i += 2; | ||
170 | sio << "Verified little endian." << sio.nl; | ||
171 | } | ||
172 | else | ||
173 | { | ||
174 | sio << "Assuming little endian." << sio.nl; | ||
175 | } | ||
176 | uint16_t hi, lo; | ||
177 | for( ; i; i++ ) | ||
178 | { | ||
179 | hi = (((uint8_t)*i)) | ((uint8_t)*(++i)<<8); | ||
180 | append16( hi ); | ||
181 | if( (hi&0xD800u) == 0xD800u ) | ||
182 | { | ||
183 | lo = (((uint8_t)*(++i))) | ((uint8_t)*(++i)<<8); | ||
184 | append16( lo ); | ||
185 | } | ||
186 | } | ||
187 | } | ||
188 | |||
189 | void Bu::UtfString::setUtf32( const Bu::String &sInput ) | ||
190 | { | ||
191 | Bu::String::const_iterator i = sInput.begin(); | ||
192 | if( (uint8_t)*i == 0x00 && | ||
193 | (uint8_t)*(++i) == 0x00 && | ||
194 | (uint8_t)*(++i) == 0xFF && | ||
195 | (uint8_t)*(++i) == 0xFE ) | ||
196 | { | ||
197 | setUtf32le( sInput ); | ||
198 | return; | ||
199 | } | ||
200 | setUtf32be( sInput ); | ||
201 | } | ||
202 | |||
203 | void Bu::UtfString::setUtf32be( const Bu::String &sInput ) | ||
204 | { | ||
205 | Bu::String::const_iterator i = sInput.begin(); | ||
206 | if( (uint8_t)*i == 0x00 && | ||
207 | (uint8_t)*(++i) == 0x00 && | ||
208 | (uint8_t)*(++i) == 0xFE && | ||
209 | (uint8_t)*(++i) == 0xFF ) | ||
210 | { | ||
211 | i++; | ||
212 | sio << "Verified big endian." << sio.nl; | ||
213 | } | ||
214 | else | ||
215 | { | ||
216 | i = sInput.begin(); | ||
217 | sio << "Assuming big endian." << sio.nl; | ||
218 | } | ||
219 | for( ; i; i++ ) | ||
220 | { | ||
221 | append( (((uint8_t)*i)<<24) | | ||
222 | (((uint8_t)*(++i))<<16) | | ||
223 | (((uint8_t)*(++i))<<8) | | ||
224 | ((uint8_t)*(++i)) | ||
225 | ); | ||
226 | } | ||
227 | } | ||
228 | |||
229 | void Bu::UtfString::setUtf32le( const Bu::String &sInput ) | ||
230 | { | ||
231 | Bu::String::const_iterator i = sInput.begin(); | ||
232 | if( (uint8_t)*i == 0x00 && | ||
233 | (uint8_t)*(++i) == 0x00 && | ||
234 | (uint8_t)*(++i) == 0xFF && | ||
235 | (uint8_t)*(++i) == 0xFE ) | ||
236 | { | ||
237 | i++; | ||
238 | sio << "Verified little endian." << sio.nl; | ||
239 | } | ||
240 | else | ||
241 | { | ||
242 | i = sInput.begin(); | ||
243 | sio << "Assuming little endian." << sio.nl; | ||
244 | } | ||
245 | for( ; i; i++ ) | ||
246 | { | ||
247 | append( ((uint8_t)*i) | | ||
248 | (((uint8_t)*(++i))<<8) | | ||
249 | (((uint8_t)*(++i))<<16) | | ||
250 | (((uint8_t)*(++i))<<24) | ||
251 | ); | ||
252 | } | ||
253 | } | ||
254 | |||
255 | void Bu::UtfString::write( Bu::Stream &sOut, Encoding eEnc ) | ||
256 | { | ||
257 | switch( eEnc ) | ||
258 | { | ||
259 | case Utf8: | ||
260 | writeUtf8( sOut ); | ||
261 | break; | ||
262 | |||
263 | case Utf16: | ||
264 | writeUtf16( sOut ); | ||
265 | break; | ||
266 | |||
267 | case Utf16be: | ||
268 | writeUtf16be( sOut ); | ||
269 | break; | ||
270 | |||
271 | case Utf16le: | ||
272 | writeUtf16le( sOut ); | ||
273 | break; | ||
274 | |||
275 | case Utf32: | ||
276 | writeUtf32( sOut ); | ||
277 | break; | ||
278 | |||
279 | case Utf32be: | ||
280 | writeUtf32be( sOut ); | ||
281 | break; | ||
282 | |||
283 | case Utf32le: | ||
284 | writeUtf32le( sOut ); | ||
285 | break; | ||
286 | |||
287 | case Ucs2: | ||
288 | throw Bu::ExceptionBase("Ucs2 not supported yet."); | ||
289 | break; | ||
290 | |||
291 | case Ucs4: | ||
292 | throw Bu::ExceptionBase("Ucs4 not supported yet."); | ||
293 | break; | ||
294 | |||
295 | case GuessEncoding: | ||
296 | throw Bu::ExceptionBase( | ||
297 | "GuessEncoding is incompatible with encoding."); | ||
298 | break; | ||
299 | |||
300 | } | ||
301 | } | ||
302 | |||
303 | void Bu::UtfString::writeUtf8( Bu::Stream &sOut ) | ||
304 | { | ||
305 | } | ||
306 | |||
307 | void Bu::UtfString::writeUtf16( Bu::Stream &sOut ) | ||
308 | { | ||
309 | } | ||
310 | |||
311 | void Bu::UtfString::writeUtf16be( Bu::Stream &sOut ) | ||
312 | { | ||
313 | } | ||
314 | |||
315 | void Bu::UtfString::writeUtf16le( Bu::Stream &sOut ) | ||
316 | { | ||
317 | } | ||
318 | |||
319 | void Bu::UtfString::writeUtf32( Bu::Stream &sOut ) | ||
320 | { | ||
321 | } | ||
322 | |||
323 | void Bu::UtfString::writeUtf32be( Bu::Stream &sOut ) | ||
324 | { | ||
325 | } | ||
326 | |||
327 | void Bu::UtfString::writeUtf32le( Bu::Stream &sOut ) | ||
328 | { | ||
329 | } | ||
122 | 330 | ||
123 | Bu::UtfChar Bu::UtfString::get( int iIndex ) | 331 | Bu::UtfChar Bu::UtfString::get( int iIndex ) |
124 | { | 332 | { |
125 | Bu::UtfChar i = aData[iIndex]; | 333 | return nextChar( iIndex ); |
334 | } | ||
335 | |||
336 | Bu::UtfChar Bu::UtfString::nextChar( int &iIndex ) | ||
337 | { | ||
338 | Bu::UtfChar i = aData[iIndex++]; | ||
126 | switch( i&0xFC00 ) | 339 | switch( i&0xFC00 ) |
127 | { | 340 | { |
128 | case 0xD800: | 341 | case 0xD800: |
129 | sio << "(hi) "; | 342 | return (((i&0x3FF)<<10) | ((aData[iIndex++]&0x3FF)))+0x10000; |
130 | return (((i&0x3FF)<<10) | ((aData[iIndex+1]&0x3FF)))+0x10000; | ||
131 | 343 | ||
132 | case 0xDC00: | 344 | case 0xDC00: |
133 | sio << "(lo) "; | 345 | return (((aData[iIndex-2]&0x3FF)<<10) | ((i&0x3FF)))+0x10000; |
134 | return 0; | ||
135 | 346 | ||
136 | default: | 347 | default: |
137 | sio << "(--) "; | 348 | return i; |
138 | return i&0xFC00; | ||
139 | } | 349 | } |
140 | } | 350 | } |
141 | 351 | ||