summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMike Buland <eichlan@xagasoft.com>2011-04-04 14:59:13 +0000
committerMike Buland <eichlan@xagasoft.com>2011-04-04 14:59:13 +0000
commit6ff23ac5097f7d92ac8840c2ba17d1dbf1eb80a5 (patch)
treefc70404d66854bba713bff2350f5f69f43bd85bc
parentabbf45c1da7f3e3a542e6c6339a1bab31283f22e (diff)
downloadlibbu++-6ff23ac5097f7d92ac8840c2ba17d1dbf1eb80a5.tar.gz
libbu++-6ff23ac5097f7d92ac8840c2ba17d1dbf1eb80a5.tar.bz2
libbu++-6ff23ac5097f7d92ac8840c2ba17d1dbf1eb80a5.tar.xz
libbu++-6ff23ac5097f7d92ac8840c2ba17d1dbf1eb80a5.zip
UtfString is going really well. It can now parse Utf8, Utf16 (le,be), and
Utf32 (le,be). The internal storage seems to be working fine, although we do have a problem with random access; at least we can tell which half of a surrogate pair we're on, so we can always rapidly determine the entire code point from any utf16 index that we're on. The only optimization that I'm not doing yet is reading in entire 16bit or 32bit words at a time and converting them from their byte order to native. There are a few potential issues with that, so we'll see. I added a couple of testing data files and a test program; I'll delete them all just as soon as it's verified to write correctly.
-rw-r--r--src/utfstring.cpp240
-rw-r--r--src/utfstring.h52
-rw-r--r--test.utf161
-rw-r--r--test.utf16be1
-rw-r--r--test.utf16le1
-rw-r--r--utf16.cpp42
6 files changed, 319 insertions, 18 deletions
diff --git a/src/utfstring.cpp b/src/utfstring.cpp
index bb0a011..7c4ba19 100644
--- a/src/utfstring.cpp
+++ b/src/utfstring.cpp
@@ -8,9 +8,13 @@
8#include "bu/utfstring.h" 8#include "bu/utfstring.h"
9 9
10#include "bu/string.h" 10#include "bu/string.h"
11#include "bu/stream.h"
11 12
12#include <endian.h> 13#include <endian.h>
13 14
15#include "bu/sio.h"
16using Bu::sio;
17
14Bu::UtfString::UtfString() 18Bu::UtfString::UtfString()
15{ 19{
16} 20}
@@ -33,20 +37,35 @@ void Bu::UtfString::set( const Bu::String &sInput, Encoding eEnc )
33 break; 37 break;
34 38
35 case Utf16: 39 case Utf16:
36 case Utf16be:
37 setUtf16( sInput ); 40 setUtf16( sInput );
38 break; 41 break;
39 42
43 case Utf16be:
44 setUtf16be( sInput );
45 break;
46
40 case Utf16le: 47 case Utf16le:
41 throw Bu::ExceptionBase("Utf16le not supported yet."); 48 setUtf16le( sInput );
42 break; 49 break;
43 50
44 case Utf32: 51 case Utf32:
45 throw Bu::ExceptionBase("Utf32 not supported yet."); 52 setUtf32( sInput );
53 break;
54
55 case Utf32be:
56 setUtf32be( sInput );
57 break;
58
59 case Utf32le:
60 setUtf32le( sInput );
61 break;
62
63 case Ucs2:
64 throw Bu::ExceptionBase("Ucs2 not supported yet.");
46 break; 65 break;
47 66
48 case Ucs16: 67 case Ucs4:
49 throw Bu::ExceptionBase("Ucs16 not supported yet."); 68 throw Bu::ExceptionBase("Ucs4 not supported yet.");
50 break; 69 break;
51 70
52 case GuessEncoding: 71 case GuessEncoding:
@@ -104,8 +123,32 @@ void Bu::UtfString::setUtf8( const Bu::String &sInput )
104 123
105void Bu::UtfString::setUtf16( const Bu::String &sInput ) 124void Bu::UtfString::setUtf16( const Bu::String &sInput )
106{ 125{
126 Bu::String::const_iterator i = sInput.begin();
127 if( (uint8_t)*sInput.begin() == 0xFF &&
128 (uint8_t)*(sInput.begin()+1) == 0xFE )
129 {
130 setUtf16le( sInput );
131 return;
132 }
133 setUtf16be( sInput );
134}
135
136void Bu::UtfString::setUtf16be( const Bu::String &sInput )
137{
138 Bu::String::const_iterator i = sInput.begin();
139 if( (uint8_t)*sInput.begin() == 0xFE &&
140 (uint8_t)*(sInput.begin()+1) == 0xFF )
141
142 {
143 i += 2;
144 sio << "Verified big endian." << sio.nl;
145 }
146 else
147 {
148 sio << "Assuming big endian." << sio.nl;
149 }
107 uint16_t hi, lo; 150 uint16_t hi, lo;
108 for( Bu::String::const_iterator i = sInput.begin(); i; i++ ) 151 for( ; i; i++ )
109 { 152 {
110 hi = (((uint8_t)*i)<<8) | ((uint8_t)*(++i)); 153 hi = (((uint8_t)*i)<<8) | ((uint8_t)*(++i));
111 append16( hi ); 154 append16( hi );
@@ -117,25 +160,192 @@ void Bu::UtfString::setUtf16( const Bu::String &sInput )
117 } 160 }
118} 161}
119 162
120#include "bu/sio.h" 163void Bu::UtfString::setUtf16le( const Bu::String &sInput )
121using Bu::sio; 164{
165 Bu::String::const_iterator i = sInput.begin();
166 if( (uint8_t)*sInput.begin() == 0xFF &&
167 (uint8_t)*(sInput.begin()+1) == 0xFE )
168 {
169 i += 2;
170 sio << "Verified little endian." << sio.nl;
171 }
172 else
173 {
174 sio << "Assuming little endian." << sio.nl;
175 }
176 uint16_t hi, lo;
177 for( ; i; i++ )
178 {
179 hi = (((uint8_t)*i)) | ((uint8_t)*(++i)<<8);
180 append16( hi );
181 if( (hi&0xD800u) == 0xD800u )
182 {
183 lo = (((uint8_t)*(++i))) | ((uint8_t)*(++i)<<8);
184 append16( lo );
185 }
186 }
187}
188
189void Bu::UtfString::setUtf32( const Bu::String &sInput )
190{
191 Bu::String::const_iterator i = sInput.begin();
192 if( (uint8_t)*i == 0x00 &&
193 (uint8_t)*(++i) == 0x00 &&
194 (uint8_t)*(++i) == 0xFF &&
195 (uint8_t)*(++i) == 0xFE )
196 {
197 setUtf32le( sInput );
198 return;
199 }
200 setUtf32be( sInput );
201}
202
203void Bu::UtfString::setUtf32be( const Bu::String &sInput )
204{
205 Bu::String::const_iterator i = sInput.begin();
206 if( (uint8_t)*i == 0x00 &&
207 (uint8_t)*(++i) == 0x00 &&
208 (uint8_t)*(++i) == 0xFE &&
209 (uint8_t)*(++i) == 0xFF )
210 {
211 i++;
212 sio << "Verified big endian." << sio.nl;
213 }
214 else
215 {
216 i = sInput.begin();
217 sio << "Assuming big endian." << sio.nl;
218 }
219 for( ; i; i++ )
220 {
221 append( (((uint8_t)*i)<<24) |
222 (((uint8_t)*(++i))<<16) |
223 (((uint8_t)*(++i))<<8) |
224 ((uint8_t)*(++i))
225 );
226 }
227}
228
229void Bu::UtfString::setUtf32le( const Bu::String &sInput )
230{
231 Bu::String::const_iterator i = sInput.begin();
232 if( (uint8_t)*i == 0x00 &&
233 (uint8_t)*(++i) == 0x00 &&
234 (uint8_t)*(++i) == 0xFF &&
235 (uint8_t)*(++i) == 0xFE )
236 {
237 i++;
238 sio << "Verified little endian." << sio.nl;
239 }
240 else
241 {
242 i = sInput.begin();
243 sio << "Assuming little endian." << sio.nl;
244 }
245 for( ; i; i++ )
246 {
247 append( ((uint8_t)*i) |
248 (((uint8_t)*(++i))<<8) |
249 (((uint8_t)*(++i))<<16) |
250 (((uint8_t)*(++i))<<24)
251 );
252 }
253}
254
255void Bu::UtfString::write( Bu::Stream &sOut, Encoding eEnc )
256{
257 switch( eEnc )
258 {
259 case Utf8:
260 writeUtf8( sOut );
261 break;
262
263 case Utf16:
264 writeUtf16( sOut );
265 break;
266
267 case Utf16be:
268 writeUtf16be( sOut );
269 break;
270
271 case Utf16le:
272 writeUtf16le( sOut );
273 break;
274
275 case Utf32:
276 writeUtf32( sOut );
277 break;
278
279 case Utf32be:
280 writeUtf32be( sOut );
281 break;
282
283 case Utf32le:
284 writeUtf32le( sOut );
285 break;
286
287 case Ucs2:
288 throw Bu::ExceptionBase("Ucs2 not supported yet.");
289 break;
290
291 case Ucs4:
292 throw Bu::ExceptionBase("Ucs4 not supported yet.");
293 break;
294
295 case GuessEncoding:
296 throw Bu::ExceptionBase(
297 "GuessEncoding is incompatible with encoding.");
298 break;
299
300 }
301}
302
// Stub reached from write() for the Utf8 case.  The body is empty, so calling
// it currently writes nothing to sOut.
303void Bu::UtfString::writeUtf8( Bu::Stream &sOut )
304{
305}
306
// Stub reached from write() for the Utf16 case.  The body is empty, so calling
// it currently writes nothing to sOut.
307void Bu::UtfString::writeUtf16( Bu::Stream &sOut )
308{
309}
310
// Stub reached from write() for the Utf16be case.  The body is empty, so
// calling it currently writes nothing to sOut.
311void Bu::UtfString::writeUtf16be( Bu::Stream &sOut )
312{
313}
314
// Stub reached from write() for the Utf16le case.  The body is empty, so
// calling it currently writes nothing to sOut.
315void Bu::UtfString::writeUtf16le( Bu::Stream &sOut )
316{
317}
318
// Stub reached from write() for the Utf32 case.  The body is empty, so calling
// it currently writes nothing to sOut.
319void Bu::UtfString::writeUtf32( Bu::Stream &sOut )
320{
321}
322
// Stub reached from write() for the Utf32be case.  The body is empty, so
// calling it currently writes nothing to sOut.
323void Bu::UtfString::writeUtf32be( Bu::Stream &sOut )
324{
325}
326
// Stub reached from write() for the Utf32le case.  The body is empty, so
// calling it currently writes nothing to sOut.
327void Bu::UtfString::writeUtf32le( Bu::Stream &sOut )
328{
329}
122 330
123Bu::UtfChar Bu::UtfString::get( int iIndex ) 331Bu::UtfChar Bu::UtfString::get( int iIndex )
124{ 332{
125 Bu::UtfChar i = aData[iIndex]; 333 return nextChar( iIndex );
334}
335
336Bu::UtfChar Bu::UtfString::nextChar( int &iIndex )
337{
338 Bu::UtfChar i = aData[iIndex++];
126 switch( i&0xFC00 ) 339 switch( i&0xFC00 )
127 { 340 {
128 case 0xD800: 341 case 0xD800:
129 sio << "(hi) "; 342 return (((i&0x3FF)<<10) | ((aData[iIndex++]&0x3FF)))+0x10000;
130 return (((i&0x3FF)<<10) | ((aData[iIndex+1]&0x3FF)))+0x10000;
131 343
132 case 0xDC00: 344 case 0xDC00:
133 sio << "(lo) "; 345 return (((aData[iIndex-2]&0x3FF)<<10) | ((i&0x3FF)))+0x10000;
134 return 0;
135 346
136 default: 347 default:
137 sio << "(--) "; 348 return i;
138 return i&0xFC00;
139 } 349 }
140} 350}
141 351
diff --git a/src/utfstring.h b/src/utfstring.h
index 79ef62e..8448ea4 100644
--- a/src/utfstring.h
+++ b/src/utfstring.h
@@ -14,6 +14,7 @@
14namespace Bu 14namespace Bu
15{ 15{
16 class String; 16 class String;
17 class Stream;
17 18
18 /** 19 /**
19 * UtfChar isn't actually a character, unicode specifies "code points" not 20 * UtfChar isn't actually a character, unicode specifies "code points" not
@@ -35,7 +36,10 @@ namespace Bu
35 Utf16be, 36 Utf16be,
36 Utf16le, 37 Utf16le,
37 Utf32, 38 Utf32,
38 Ucs16, 39 Utf32be,
40 Utf32le,
41 Ucs2,
42 Ucs4,
39 GuessEncoding 43 GuessEncoding
40 }; 44 };
41 45
@@ -43,17 +47,59 @@ namespace Bu
43 UtfString( const Bu::String &sInput, Encoding eEnc=Utf8 ); 47 UtfString( const Bu::String &sInput, Encoding eEnc=Utf8 );
44 virtual ~UtfString(); 48 virtual ~UtfString();
45 49
50 class iterator
51 {
52 private:
53 iterator( UtfString *pSrc, int iCodePos ) :
54 pSrc( pSrc ), iCodePos( iCodePos )
55 {
56 }
57
58 public:
59 iterator() :
60 pSrc( NULL ), iCodePos( 0 )
61 {
62 }
63
64 UtfChar operator*()
65 {
66 if( !pSrc )
67 throw Bu::ExceptionBase("invalid UtfString::iterator dereferenced.");
68 return pSrc->nextChar( iCodePos );
69 }
70
71 private:
72 UtfString *pSrc;
73 int iCodePos;
74 };
75
46 void append( UtfChar ch ); 76 void append( UtfChar ch );
47 77
48 void set( const Bu::String &sInput, Encoding eEnc=Utf8 ); 78 void set( const Bu::String &sInput, Encoding eEnc=Utf8 );
49 void setUtf8( const Bu::String &sInput ); 79 void setUtf8( const Bu::String &sInput );
50 void setUtf16( const Bu::String &sInput ); 80 void setUtf16( const Bu::String &sInput );
51// void setUtf16be( const Bu::String &sInput ); 81 void setUtf16be( const Bu::String &sInput );
52// void setUtf16le( const Bu::String &sInput ); 82 void setUtf16le( const Bu::String &sInput );
83 void setUtf32( const Bu::String &sInput );
84 void setUtf32be( const Bu::String &sInput );
85 void setUtf32le( const Bu::String &sInput );
86
87 void write( Bu::Stream &sOut, Encoding eEnc=Utf8 );
88 void writeUtf8( Bu::Stream &sOut );
89 void writeUtf16( Bu::Stream &sOut );
90 void writeUtf16be( Bu::Stream &sOut );
91 void writeUtf16le( Bu::Stream &sOut );
92 void writeUtf32( Bu::Stream &sOut );
93 void writeUtf32be( Bu::Stream &sOut );
94 void writeUtf32le( Bu::Stream &sOut );
95
96 Bu::String to( Encoding eEnc=Utf8 );
97 Bu::String toUtf8();
53 98
54 void debug(); 99 void debug();
55 100
56 UtfChar get( int iIndex ); 101 UtfChar get( int iIndex );
102 UtfChar nextChar( int &iIndex );
57 103
58 private: 104 private:
59 void append16( uint16_t i ) { aData.append( i ); } 105 void append16( uint16_t i ) { aData.append( i ); }
diff --git a/test.utf16 b/test.utf16
new file mode 100644
index 0000000..86a63c3
--- /dev/null
+++ b/test.utf16
@@ -0,0 +1 @@
¥Ëæ)˜Ø=Þ<Ûÿßý$H \ No newline at end of file
diff --git a/test.utf16be b/test.utf16be
new file mode 100644
index 0000000..136ad1a
--- /dev/null
+++ b/test.utf16be
@@ -0,0 +1 @@
þÿ¥Ëæ)˜Ø=Þ<Ûÿßý$H \ No newline at end of file
diff --git a/test.utf16le b/test.utf16le
new file mode 100644
index 0000000..9f610d6
--- /dev/null
+++ b/test.utf16le
@@ -0,0 +1 @@
ÿþ¥Ëæ˜)=Ø<ÞÿÛýßH$ \ No newline at end of file
diff --git a/utf16.cpp b/utf16.cpp
new file mode 100644
index 0000000..eedb521
--- /dev/null
+++ b/utf16.cpp
@@ -0,0 +1,42 @@
1#include <stdio.h>
2#include <stdint.h>
3
/**
 * Print the 16 bits of u to stdout as '0'/'1' characters, most significant
 * bit first, followed by a newline.
 */
void bitprint( uint16_t u )
{
	int iBit = 16;
	while( iBit-- > 0 )
	{
		putchar( ((u>>iBit)&1) ? '1' : '0' );
	}
	putchar('\n');
}
10
/**
 * Print the 32 bits of u to stdout as '0'/'1' characters, most significant
 * bit first, followed by a newline.
 */
void bitprint( uint32_t u )
{
	for( int i = 31; i >= 0; i-- )
	{
		// Shift the unsigned value down instead of shifting the int literal 1
		// up: 1<<31 overflows a signed 32-bit int.
		printf("%c", ((u>>i)&1u)?'1':'0');
	}
	printf("\n");
}
17
/**
 * Split a code point into a UTF-16 surrogate pair and print the result.
 *
 * 0x10000 is subtracted from the input; the upper ten bits of the remainder
 * are tagged with 0xD800 and the lower ten bits with 0xDC00.
 *
 *@param in    Code point to encode.
 *@param outHi Receives the 0xD800-tagged unit.
 *@param outLo Receives the 0xDC00-tagged unit.
 */
void utoutf16( uint32_t in, uint16_t &outHi, uint16_t &outLo )
{
	const uint32_t uOffset = in - 0x10000;
	const uint32_t uUpperTen = (uOffset>>10) & 0x3FF;
	const uint32_t uLowerTen = uOffset & 0x3FF;

	outHi = uUpperTen | 0xD800u;
	outLo = uLowerTen | 0xDC00u;
	printf("0x%X == 0x%X, 0x%X\n", in, outHi, outLo );
}
24
/**
 * Combine a UTF-16 surrogate pair into a single code point.
 *
 * The low ten bits of hi become bits 10-19, the low ten bits of lo become
 * bits 0-9, and 0x10000 is added to the combined value.
 *
 *@param hi First (high) unit of the pair.
 *@param lo Second (low) unit of the pair.
 *@returns The combined code point.
 */
int32_t utf16tou( uint16_t hi, uint16_t lo )
{
	const uint32_t uUpper = ((uint32_t)hi & 0x3FF) << 10;
	const uint32_t uLower = (uint32_t)lo & 0x3FF;
	return (uUpper | uLower) + 0x10000;
}
29
// Scratch driver for the surrogate helpers: prints the two tag values used by
// utoutf16(), encodes three sample code points, then decodes the last pair
// again so the round trip can be checked by eye.
30int main()
31{
32 bitprint( 0xD800u );
33 bitprint( 0xDC00u );
34 uint16_t hi, lo;
35 utoutf16( 0x1D11E, hi, lo ); // U+1D11E MUSICAL SYMBOL G CLEF
36 utoutf16( 0x10FFFD, hi, lo ); // U+10FFFD, last code point of the plane 16 private use area
37 utoutf16( 0x1F63C, hi, lo ); // U+1F63C CAT FACE WITH WRY SMILE
// hi/lo now hold the pair produced by the final utoutf16() call above.
38 bitprint( hi );
39 bitprint( lo );
40 printf("0x%X\n", utf16tou( hi, lo ) );
41 return 0;
42}