diff options
author | Mike Buland <eichlan@xagasoft.com> | 2011-04-04 14:59:13 +0000 |
---|---|---|
committer | Mike Buland <eichlan@xagasoft.com> | 2011-04-04 14:59:13 +0000 |
commit | 6ff23ac5097f7d92ac8840c2ba17d1dbf1eb80a5 (patch) | |
tree | fc70404d66854bba713bff2350f5f69f43bd85bc /src/utfstring.cpp | |
parent | abbf45c1da7f3e3a542e6c6339a1bab31283f22e (diff) | |
download | libbu++-6ff23ac5097f7d92ac8840c2ba17d1dbf1eb80a5.tar.gz libbu++-6ff23ac5097f7d92ac8840c2ba17d1dbf1eb80a5.tar.bz2 libbu++-6ff23ac5097f7d92ac8840c2ba17d1dbf1eb80a5.tar.xz libbu++-6ff23ac5097f7d92ac8840c2ba17d1dbf1eb80a5.zip |
UtfString is going really well. It can now parse Utf8, Utf16 (le,be), and
Utf32 (le,be). The internal storage seems to be working fine, although we do
have a problem with random access, but at least we can tell which half of a
surrogate pair we're on, so we can always rapidly determine the entire code
point from any utf16 index that we're on.
The only optimization that I'm not doing yet is reading in entire 16-bit or 32-bit
words at a time and converting them from their byte order to native. There are
a few potential issues with that, so we'll see.
I added a couple of testing data files and a test program; I'll delete them all
just as soon as it's verified to write correctly.
Diffstat (limited to 'src/utfstring.cpp')
-rw-r--r-- | src/utfstring.cpp | 240 |
1 files changed, 225 insertions, 15 deletions
diff --git a/src/utfstring.cpp b/src/utfstring.cpp index bb0a011..7c4ba19 100644 --- a/src/utfstring.cpp +++ b/src/utfstring.cpp | |||
@@ -8,9 +8,13 @@ | |||
8 | #include "bu/utfstring.h" | 8 | #include "bu/utfstring.h" |
9 | 9 | ||
10 | #include "bu/string.h" | 10 | #include "bu/string.h" |
11 | #include "bu/stream.h" | ||
11 | 12 | ||
12 | #include <endian.h> | 13 | #include <endian.h> |
13 | 14 | ||
15 | #include "bu/sio.h" | ||
16 | using Bu::sio; | ||
17 | |||
14 | Bu::UtfString::UtfString() | 18 | Bu::UtfString::UtfString() |
15 | { | 19 | { |
16 | } | 20 | } |
@@ -33,20 +37,35 @@ void Bu::UtfString::set( const Bu::String &sInput, Encoding eEnc ) | |||
33 | break; | 37 | break; |
34 | 38 | ||
35 | case Utf16: | 39 | case Utf16: |
36 | case Utf16be: | ||
37 | setUtf16( sInput ); | 40 | setUtf16( sInput ); |
38 | break; | 41 | break; |
39 | 42 | ||
43 | case Utf16be: | ||
44 | setUtf16be( sInput ); | ||
45 | break; | ||
46 | |||
40 | case Utf16le: | 47 | case Utf16le: |
41 | throw Bu::ExceptionBase("Utf16le not supported yet."); | 48 | setUtf16le( sInput ); |
42 | break; | 49 | break; |
43 | 50 | ||
44 | case Utf32: | 51 | case Utf32: |
45 | throw Bu::ExceptionBase("Utf32 not supported yet."); | 52 | setUtf32( sInput ); |
53 | break; | ||
54 | |||
55 | case Utf32be: | ||
56 | setUtf32be( sInput ); | ||
57 | break; | ||
58 | |||
59 | case Utf32le: | ||
60 | setUtf32le( sInput ); | ||
61 | break; | ||
62 | |||
63 | case Ucs2: | ||
64 | throw Bu::ExceptionBase("Ucs2 not supported yet."); | ||
46 | break; | 65 | break; |
47 | 66 | ||
48 | case Ucs16: | 67 | case Ucs4: |
49 | throw Bu::ExceptionBase("Ucs16 not supported yet."); | 68 | throw Bu::ExceptionBase("Ucs4 not supported yet."); |
50 | break; | 69 | break; |
51 | 70 | ||
52 | case GuessEncoding: | 71 | case GuessEncoding: |
@@ -104,8 +123,32 @@ void Bu::UtfString::setUtf8( const Bu::String &sInput ) | |||
104 | 123 | ||
105 | void Bu::UtfString::setUtf16( const Bu::String &sInput ) | 124 | void Bu::UtfString::setUtf16( const Bu::String &sInput ) |
106 | { | 125 | { |
126 | Bu::String::const_iterator i = sInput.begin(); | ||
127 | if( (uint8_t)*sInput.begin() == 0xFF && | ||
128 | (uint8_t)*(sInput.begin()+1) == 0xFE ) | ||
129 | { | ||
130 | setUtf16le( sInput ); | ||
131 | return; | ||
132 | } | ||
133 | setUtf16be( sInput ); | ||
134 | } | ||
135 | |||
136 | void Bu::UtfString::setUtf16be( const Bu::String &sInput ) | ||
137 | { | ||
138 | Bu::String::const_iterator i = sInput.begin(); | ||
139 | if( (uint8_t)*sInput.begin() == 0xFE && | ||
140 | (uint8_t)*(sInput.begin()+1) == 0xFF ) | ||
141 | |||
142 | { | ||
143 | i += 2; | ||
144 | sio << "Verified big endian." << sio.nl; | ||
145 | } | ||
146 | else | ||
147 | { | ||
148 | sio << "Assuming big endian." << sio.nl; | ||
149 | } | ||
107 | uint16_t hi, lo; | 150 | uint16_t hi, lo; |
108 | for( Bu::String::const_iterator i = sInput.begin(); i; i++ ) | 151 | for( ; i; i++ ) |
109 | { | 152 | { |
110 | hi = (((uint8_t)*i)<<8) | ((uint8_t)*(++i)); | 153 | hi = (((uint8_t)*i)<<8) | ((uint8_t)*(++i)); |
111 | append16( hi ); | 154 | append16( hi ); |
@@ -117,25 +160,192 @@ void Bu::UtfString::setUtf16( const Bu::String &sInput ) | |||
117 | } | 160 | } |
118 | } | 161 | } |
119 | 162 | ||
120 | #include "bu/sio.h" | 163 | void Bu::UtfString::setUtf16le( const Bu::String &sInput ) |
121 | using Bu::sio; | 164 | { |
165 | Bu::String::const_iterator i = sInput.begin(); | ||
166 | if( (uint8_t)*sInput.begin() == 0xFF && | ||
167 | (uint8_t)*(sInput.begin()+1) == 0xFE ) | ||
168 | { | ||
169 | i += 2; | ||
170 | sio << "Verified little endian." << sio.nl; | ||
171 | } | ||
172 | else | ||
173 | { | ||
174 | sio << "Assuming little endian." << sio.nl; | ||
175 | } | ||
176 | uint16_t hi, lo; | ||
177 | for( ; i; i++ ) | ||
178 | { | ||
179 | hi = (((uint8_t)*i)) | ((uint8_t)*(++i)<<8); | ||
180 | append16( hi ); | ||
181 | if( (hi&0xD800u) == 0xD800u ) | ||
182 | { | ||
183 | lo = (((uint8_t)*(++i))) | ((uint8_t)*(++i)<<8); | ||
184 | append16( lo ); | ||
185 | } | ||
186 | } | ||
187 | } | ||
188 | |||
189 | void Bu::UtfString::setUtf32( const Bu::String &sInput ) | ||
190 | { | ||
191 | Bu::String::const_iterator i = sInput.begin(); | ||
192 | if( (uint8_t)*i == 0x00 && | ||
193 | (uint8_t)*(++i) == 0x00 && | ||
194 | (uint8_t)*(++i) == 0xFF && | ||
195 | (uint8_t)*(++i) == 0xFE ) | ||
196 | { | ||
197 | setUtf32le( sInput ); | ||
198 | return; | ||
199 | } | ||
200 | setUtf32be( sInput ); | ||
201 | } | ||
202 | |||
203 | void Bu::UtfString::setUtf32be( const Bu::String &sInput ) | ||
204 | { | ||
205 | Bu::String::const_iterator i = sInput.begin(); | ||
206 | if( (uint8_t)*i == 0x00 && | ||
207 | (uint8_t)*(++i) == 0x00 && | ||
208 | (uint8_t)*(++i) == 0xFE && | ||
209 | (uint8_t)*(++i) == 0xFF ) | ||
210 | { | ||
211 | i++; | ||
212 | sio << "Verified big endian." << sio.nl; | ||
213 | } | ||
214 | else | ||
215 | { | ||
216 | i = sInput.begin(); | ||
217 | sio << "Assuming big endian." << sio.nl; | ||
218 | } | ||
219 | for( ; i; i++ ) | ||
220 | { | ||
221 | append( (((uint8_t)*i)<<24) | | ||
222 | (((uint8_t)*(++i))<<16) | | ||
223 | (((uint8_t)*(++i))<<8) | | ||
224 | ((uint8_t)*(++i)) | ||
225 | ); | ||
226 | } | ||
227 | } | ||
228 | |||
229 | void Bu::UtfString::setUtf32le( const Bu::String &sInput ) | ||
230 | { | ||
231 | Bu::String::const_iterator i = sInput.begin(); | ||
232 | if( (uint8_t)*i == 0x00 && | ||
233 | (uint8_t)*(++i) == 0x00 && | ||
234 | (uint8_t)*(++i) == 0xFF && | ||
235 | (uint8_t)*(++i) == 0xFE ) | ||
236 | { | ||
237 | i++; | ||
238 | sio << "Verified little endian." << sio.nl; | ||
239 | } | ||
240 | else | ||
241 | { | ||
242 | i = sInput.begin(); | ||
243 | sio << "Assuming little endian." << sio.nl; | ||
244 | } | ||
245 | for( ; i; i++ ) | ||
246 | { | ||
247 | append( ((uint8_t)*i) | | ||
248 | (((uint8_t)*(++i))<<8) | | ||
249 | (((uint8_t)*(++i))<<16) | | ||
250 | (((uint8_t)*(++i))<<24) | ||
251 | ); | ||
252 | } | ||
253 | } | ||
254 | |||
255 | void Bu::UtfString::write( Bu::Stream &sOut, Encoding eEnc ) | ||
256 | { | ||
257 | switch( eEnc ) | ||
258 | { | ||
259 | case Utf8: | ||
260 | writeUtf8( sOut ); | ||
261 | break; | ||
262 | |||
263 | case Utf16: | ||
264 | writeUtf16( sOut ); | ||
265 | break; | ||
266 | |||
267 | case Utf16be: | ||
268 | writeUtf16be( sOut ); | ||
269 | break; | ||
270 | |||
271 | case Utf16le: | ||
272 | writeUtf16le( sOut ); | ||
273 | break; | ||
274 | |||
275 | case Utf32: | ||
276 | writeUtf32( sOut ); | ||
277 | break; | ||
278 | |||
279 | case Utf32be: | ||
280 | writeUtf32be( sOut ); | ||
281 | break; | ||
282 | |||
283 | case Utf32le: | ||
284 | writeUtf32le( sOut ); | ||
285 | break; | ||
286 | |||
287 | case Ucs2: | ||
288 | throw Bu::ExceptionBase("Ucs2 not supported yet."); | ||
289 | break; | ||
290 | |||
291 | case Ucs4: | ||
292 | throw Bu::ExceptionBase("Ucs4 not supported yet."); | ||
293 | break; | ||
294 | |||
295 | case GuessEncoding: | ||
296 | throw Bu::ExceptionBase( | ||
297 | "GuessEncoding is incompatible with encoding."); | ||
298 | break; | ||
299 | |||
300 | } | ||
301 | } | ||
302 | |||
303 | void Bu::UtfString::writeUtf8( Bu::Stream &sOut ) | ||
304 | { | ||
305 | } | ||
306 | |||
307 | void Bu::UtfString::writeUtf16( Bu::Stream &sOut ) | ||
308 | { | ||
309 | } | ||
310 | |||
311 | void Bu::UtfString::writeUtf16be( Bu::Stream &sOut ) | ||
312 | { | ||
313 | } | ||
314 | |||
315 | void Bu::UtfString::writeUtf16le( Bu::Stream &sOut ) | ||
316 | { | ||
317 | } | ||
318 | |||
319 | void Bu::UtfString::writeUtf32( Bu::Stream &sOut ) | ||
320 | { | ||
321 | } | ||
322 | |||
323 | void Bu::UtfString::writeUtf32be( Bu::Stream &sOut ) | ||
324 | { | ||
325 | } | ||
326 | |||
327 | void Bu::UtfString::writeUtf32le( Bu::Stream &sOut ) | ||
328 | { | ||
329 | } | ||
122 | 330 | ||
123 | Bu::UtfChar Bu::UtfString::get( int iIndex ) | 331 | Bu::UtfChar Bu::UtfString::get( int iIndex ) |
124 | { | 332 | { |
125 | Bu::UtfChar i = aData[iIndex]; | 333 | return nextChar( iIndex ); |
334 | } | ||
335 | |||
336 | Bu::UtfChar Bu::UtfString::nextChar( int &iIndex ) | ||
337 | { | ||
338 | Bu::UtfChar i = aData[iIndex++]; | ||
126 | switch( i&0xFC00 ) | 339 | switch( i&0xFC00 ) |
127 | { | 340 | { |
128 | case 0xD800: | 341 | case 0xD800: |
129 | sio << "(hi) "; | 342 | return (((i&0x3FF)<<10) | ((aData[iIndex++]&0x3FF)))+0x10000; |
130 | return (((i&0x3FF)<<10) | ((aData[iIndex+1]&0x3FF)))+0x10000; | ||
131 | 343 | ||
132 | case 0xDC00: | 344 | case 0xDC00: |
133 | sio << "(lo) "; | 345 | return (((aData[iIndex-2]&0x3FF)<<10) | ((i&0x3FF)))+0x10000; |
134 | return 0; | ||
135 | 346 | ||
136 | default: | 347 | default: |
137 | sio << "(--) "; | 348 | return i; |
138 | return i&0xFC00; | ||
139 | } | 349 | } |
140 | } | 350 | } |
141 | 351 | ||