summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/utfstring.cpp240
-rw-r--r--src/utfstring.h52
-rw-r--r--test.utf161
-rw-r--r--test.utf16be1
-rw-r--r--test.utf16le1
-rw-r--r--utf16.cpp42
6 files changed, 319 insertions, 18 deletions
diff --git a/src/utfstring.cpp b/src/utfstring.cpp
index bb0a011..7c4ba19 100644
--- a/src/utfstring.cpp
+++ b/src/utfstring.cpp
@@ -8,9 +8,13 @@
8#include "bu/utfstring.h" 8#include "bu/utfstring.h"
9 9
10#include "bu/string.h" 10#include "bu/string.h"
11#include "bu/stream.h"
11 12
12#include <endian.h> 13#include <endian.h>
13 14
15#include "bu/sio.h"
16using Bu::sio;
17
// Default constructor: the body is empty, so no explicit setup is done here.
14Bu::UtfString::UtfString() 18Bu::UtfString::UtfString()
15{ 19{
16} 20}
@@ -33,20 +37,35 @@ void Bu::UtfString::set( const Bu::String &sInput, Encoding eEnc )
33 break; 37 break;
34 38
35 case Utf16: 39 case Utf16:
36 case Utf16be:
37 setUtf16( sInput ); 40 setUtf16( sInput );
38 break; 41 break;
39 42
43 case Utf16be:
44 setUtf16be( sInput );
45 break;
46
40 case Utf16le: 47 case Utf16le:
41 throw Bu::ExceptionBase("Utf16le not supported yet."); 48 setUtf16le( sInput );
42 break; 49 break;
43 50
44 case Utf32: 51 case Utf32:
45 throw Bu::ExceptionBase("Utf32 not supported yet."); 52 setUtf32( sInput );
53 break;
54
55 case Utf32be:
56 setUtf32be( sInput );
57 break;
58
59 case Utf32le:
60 setUtf32le( sInput );
61 break;
62
63 case Ucs2:
64 throw Bu::ExceptionBase("Ucs2 not supported yet.");
46 break; 65 break;
47 66
48 case Ucs16: 67 case Ucs4:
49 throw Bu::ExceptionBase("Ucs16 not supported yet."); 68 throw Bu::ExceptionBase("Ucs4 not supported yet.");
50 break; 69 break;
51 70
52 case GuessEncoding: 71 case GuessEncoding:
@@ -104,8 +123,32 @@ void Bu::UtfString::setUtf8( const Bu::String &sInput )
104 123
105void Bu::UtfString::setUtf16( const Bu::String &sInput ) 124void Bu::UtfString::setUtf16( const Bu::String &sInput )
106{ 125{
126 Bu::String::const_iterator i = sInput.begin();
127 if( (uint8_t)*sInput.begin() == 0xFF &&
128 (uint8_t)*(sInput.begin()+1) == 0xFE )
129 {
130 setUtf16le( sInput );
131 return;
132 }
133 setUtf16be( sInput );
134}
135
136void Bu::UtfString::setUtf16be( const Bu::String &sInput )
137{
138 Bu::String::const_iterator i = sInput.begin();
139 if( (uint8_t)*sInput.begin() == 0xFE &&
140 (uint8_t)*(sInput.begin()+1) == 0xFF )
141
142 {
143 i += 2;
144 sio << "Verified big endian." << sio.nl;
145 }
146 else
147 {
148 sio << "Assuming big endian." << sio.nl;
149 }
107 uint16_t hi, lo; 150 uint16_t hi, lo;
108 for( Bu::String::const_iterator i = sInput.begin(); i; i++ ) 151 for( ; i; i++ )
109 { 152 {
110 hi = (((uint8_t)*i)<<8) | ((uint8_t)*(++i)); 153 hi = (((uint8_t)*i)<<8) | ((uint8_t)*(++i));
111 append16( hi ); 154 append16( hi );
@@ -117,25 +160,192 @@ void Bu::UtfString::setUtf16( const Bu::String &sInput )
117 } 160 }
118} 161}
119 162
120#include "bu/sio.h" 163void Bu::UtfString::setUtf16le( const Bu::String &sInput )
121using Bu::sio; 164{
165 Bu::String::const_iterator i = sInput.begin();
166 if( (uint8_t)*sInput.begin() == 0xFF &&
167 (uint8_t)*(sInput.begin()+1) == 0xFE )
168 {
169 i += 2;
170 sio << "Verified little endian." << sio.nl;
171 }
172 else
173 {
174 sio << "Assuming little endian." << sio.nl;
175 }
176 uint16_t hi, lo;
177 for( ; i; i++ )
178 {
179 hi = (((uint8_t)*i)) | ((uint8_t)*(++i)<<8);
180 append16( hi );
181 if( (hi&0xD800u) == 0xD800u )
182 {
183 lo = (((uint8_t)*(++i))) | ((uint8_t)*(++i)<<8);
184 append16( lo );
185 }
186 }
187}
188
189void Bu::UtfString::setUtf32( const Bu::String &sInput )
190{
191 Bu::String::const_iterator i = sInput.begin();
192 if( (uint8_t)*i == 0x00 &&
193 (uint8_t)*(++i) == 0x00 &&
194 (uint8_t)*(++i) == 0xFF &&
195 (uint8_t)*(++i) == 0xFE )
196 {
197 setUtf32le( sInput );
198 return;
199 }
200 setUtf32be( sInput );
201}
202
203void Bu::UtfString::setUtf32be( const Bu::String &sInput )
204{
205 Bu::String::const_iterator i = sInput.begin();
206 if( (uint8_t)*i == 0x00 &&
207 (uint8_t)*(++i) == 0x00 &&
208 (uint8_t)*(++i) == 0xFE &&
209 (uint8_t)*(++i) == 0xFF )
210 {
211 i++;
212 sio << "Verified big endian." << sio.nl;
213 }
214 else
215 {
216 i = sInput.begin();
217 sio << "Assuming big endian." << sio.nl;
218 }
219 for( ; i; i++ )
220 {
221 append( (((uint8_t)*i)<<24) |
222 (((uint8_t)*(++i))<<16) |
223 (((uint8_t)*(++i))<<8) |
224 ((uint8_t)*(++i))
225 );
226 }
227}
228
229void Bu::UtfString::setUtf32le( const Bu::String &sInput )
230{
231 Bu::String::const_iterator i = sInput.begin();
232 if( (uint8_t)*i == 0x00 &&
233 (uint8_t)*(++i) == 0x00 &&
234 (uint8_t)*(++i) == 0xFF &&
235 (uint8_t)*(++i) == 0xFE )
236 {
237 i++;
238 sio << "Verified little endian." << sio.nl;
239 }
240 else
241 {
242 i = sInput.begin();
243 sio << "Assuming little endian." << sio.nl;
244 }
245 for( ; i; i++ )
246 {
247 append( ((uint8_t)*i) |
248 (((uint8_t)*(++i))<<8) |
249 (((uint8_t)*(++i))<<16) |
250 (((uint8_t)*(++i))<<24)
251 );
252 }
253}
254
255void Bu::UtfString::write( Bu::Stream &sOut, Encoding eEnc )
256{
257 switch( eEnc )
258 {
259 case Utf8:
260 writeUtf8( sOut );
261 break;
262
263 case Utf16:
264 writeUtf16( sOut );
265 break;
266
267 case Utf16be:
268 writeUtf16be( sOut );
269 break;
270
271 case Utf16le:
272 writeUtf16le( sOut );
273 break;
274
275 case Utf32:
276 writeUtf32( sOut );
277 break;
278
279 case Utf32be:
280 writeUtf32be( sOut );
281 break;
282
283 case Utf32le:
284 writeUtf32le( sOut );
285 break;
286
287 case Ucs2:
288 throw Bu::ExceptionBase("Ucs2 not supported yet.");
289 break;
290
291 case Ucs4:
292 throw Bu::ExceptionBase("Ucs4 not supported yet.");
293 break;
294
295 case GuessEncoding:
296 throw Bu::ExceptionBase(
297 "GuessEncoding is incompatible with encoding.");
298 break;
299
300 }
301}
302
303void Bu::UtfString::writeUtf8( Bu::Stream &sOut )
304{
305}
306
307void Bu::UtfString::writeUtf16( Bu::Stream &sOut )
308{
309}
310
311void Bu::UtfString::writeUtf16be( Bu::Stream &sOut )
312{
313}
314
315void Bu::UtfString::writeUtf16le( Bu::Stream &sOut )
316{
317}
318
319void Bu::UtfString::writeUtf32( Bu::Stream &sOut )
320{
321}
322
323void Bu::UtfString::writeUtf32be( Bu::Stream &sOut )
324{
325}
326
327void Bu::UtfString::writeUtf32le( Bu::Stream &sOut )
328{
329}
122 330
123Bu::UtfChar Bu::UtfString::get( int iIndex ) 331Bu::UtfChar Bu::UtfString::get( int iIndex )
124{ 332{
125 Bu::UtfChar i = aData[iIndex]; 333 return nextChar( iIndex );
334}
335
336Bu::UtfChar Bu::UtfString::nextChar( int &iIndex )
337{
338 Bu::UtfChar i = aData[iIndex++];
126 switch( i&0xFC00 ) 339 switch( i&0xFC00 )
127 { 340 {
128 case 0xD800: 341 case 0xD800:
129 sio << "(hi) "; 342 return (((i&0x3FF)<<10) | ((aData[iIndex++]&0x3FF)))+0x10000;
130 return (((i&0x3FF)<<10) | ((aData[iIndex+1]&0x3FF)))+0x10000;
131 343
132 case 0xDC00: 344 case 0xDC00:
133 sio << "(lo) "; 345 return (((aData[iIndex-2]&0x3FF)<<10) | ((i&0x3FF)))+0x10000;
134 return 0;
135 346
136 default: 347 default:
137 sio << "(--) "; 348 return i;
138 return i&0xFC00;
139 } 349 }
140} 350}
141 351
diff --git a/src/utfstring.h b/src/utfstring.h
index 79ef62e..8448ea4 100644
--- a/src/utfstring.h
+++ b/src/utfstring.h
@@ -14,6 +14,7 @@
14namespace Bu 14namespace Bu
15{ 15{
16 class String; 16 class String;
17 class Stream;
17 18
18 /** 19 /**
19 * UtfChar isn't actually a character, unicode specifies "code points" not 20 * UtfChar isn't actually a character, unicode specifies "code points" not
@@ -35,7 +36,10 @@ namespace Bu
35 Utf16be, 36 Utf16be,
36 Utf16le, 37 Utf16le,
37 Utf32, 38 Utf32,
38 Ucs16, 39 Utf32be,
40 Utf32le,
41 Ucs2,
42 Ucs4,
39 GuessEncoding 43 GuessEncoding
40 }; 44 };
41 45
@@ -43,17 +47,59 @@ namespace Bu
43 UtfString( const Bu::String &sInput, Encoding eEnc=Utf8 ); 47 UtfString( const Bu::String &sInput, Encoding eEnc=Utf8 );
44 virtual ~UtfString(); 48 virtual ~UtfString();
45 49
50 class iterator
51 {
52 private:
53 iterator( UtfString *pSrc, int iCodePos ) :
54 pSrc( pSrc ), iCodePos( iCodePos )
55 {
56 }
57
58 public:
59 iterator() :
60 pSrc( NULL ), iCodePos( 0 )
61 {
62 }
63
64 UtfChar operator*()
65 {
66 if( !pSrc )
67 throw Bu::ExceptionBase("invalid UtfString::iterator dereferenced.");
68 return pSrc->nextChar( iCodePos );
69 }
70
71 private:
72 UtfString *pSrc;
73 int iCodePos;
74 };
75
46 void append( UtfChar ch ); 76 void append( UtfChar ch );
47 77
48 void set( const Bu::String &sInput, Encoding eEnc=Utf8 ); 78 void set( const Bu::String &sInput, Encoding eEnc=Utf8 );
49 void setUtf8( const Bu::String &sInput ); 79 void setUtf8( const Bu::String &sInput );
50 void setUtf16( const Bu::String &sInput ); 80 void setUtf16( const Bu::String &sInput );
51// void setUtf16be( const Bu::String &sInput ); 81 void setUtf16be( const Bu::String &sInput );
52// void setUtf16le( const Bu::String &sInput ); 82 void setUtf16le( const Bu::String &sInput );
83 void setUtf32( const Bu::String &sInput );
84 void setUtf32be( const Bu::String &sInput );
85 void setUtf32le( const Bu::String &sInput );
86
87 void write( Bu::Stream &sOut, Encoding eEnc=Utf8 );
88 void writeUtf8( Bu::Stream &sOut );
89 void writeUtf16( Bu::Stream &sOut );
90 void writeUtf16be( Bu::Stream &sOut );
91 void writeUtf16le( Bu::Stream &sOut );
92 void writeUtf32( Bu::Stream &sOut );
93 void writeUtf32be( Bu::Stream &sOut );
94 void writeUtf32le( Bu::Stream &sOut );
95
96 Bu::String to( Encoding eEnc=Utf8 );
97 Bu::String toUtf8();
53 98
54 void debug(); 99 void debug();
55 100
56 UtfChar get( int iIndex ); 101 UtfChar get( int iIndex );
102 UtfChar nextChar( int &iIndex );
57 103
58 private: 104 private:
59 void append16( uint16_t i ) { aData.append( i ); } 105 void append16( uint16_t i ) { aData.append( i ); }
diff --git a/test.utf16 b/test.utf16
new file mode 100644
index 0000000..86a63c3
--- /dev/null
+++ b/test.utf16
@@ -0,0 +1 @@
¥Ëæ)˜Ø=Þ<Ûÿßý$H \ No newline at end of file
diff --git a/test.utf16be b/test.utf16be
new file mode 100644
index 0000000..136ad1a
--- /dev/null
+++ b/test.utf16be
@@ -0,0 +1 @@
þÿ¥Ëæ)˜Ø=Þ<Ûÿßý$H \ No newline at end of file
diff --git a/test.utf16le b/test.utf16le
new file mode 100644
index 0000000..9f610d6
--- /dev/null
+++ b/test.utf16le
@@ -0,0 +1 @@
ÿþ¥Ëæ˜)=Ø<ÞÿÛýßH$ \ No newline at end of file
diff --git a/utf16.cpp b/utf16.cpp
new file mode 100644
index 0000000..eedb521
--- /dev/null
+++ b/utf16.cpp
@@ -0,0 +1,42 @@
1#include <stdio.h>
2#include <stdint.h>
3
/**
 * Print the 16 bits of u to stdout, most significant bit first, followed
 * by a newline.
 */
void bitprint( uint16_t u )
{
	int bit = 16;
	while( bit-- > 0 )
	{
		const char digit = ( (u >> bit) & 1 ) ? '1' : '0';
		printf("%c", digit);
	}
	printf("\n");
}
10
/**
 * Print the 32 bits of u to stdout, most significant bit first, followed
 * by a newline.
 */
void bitprint( uint32_t u )
{
	for( int i = 31; i >= 0; i-- )
	{
		// Build the mask from an unsigned one: a plain `1<<31` would shift
		// a signed int into its sign bit.
		const uint32_t mask = static_cast<uint32_t>(1) << i;
		printf("%c", (u&mask)?'1':'0');
	}
	printf("\n");
}
17
/**
 * Split a code point into a UTF-16 surrogate pair and print the mapping.
 *
 * 0x10000 is subtracted from the input; the upper ten bits of the result
 * are tagged with 0xD800 and the lower ten bits with 0xDC00.  The
 * subtraction is unsigned, so inputs below 0x10000 wrap around.
 *
 * @param in    Code point to encode.
 * @param outHi Receives the high surrogate.
 * @param outLo Receives the low surrogate.
 */
void utoutf16( uint32_t in, uint16_t &outHi, uint16_t &outLo )
{
	const uint32_t offset = in - 0x10000;
	const uint32_t upperTen = (offset >> 10) & 0x3FF;
	const uint32_t lowerTen = offset & 0x3FF;

	outHi = static_cast<uint16_t>( upperTen | 0xD800u );
	outLo = static_cast<uint16_t>( lowerTen | 0xDC00u );

	printf("0x%X == 0x%X, 0x%X\n", in, outHi, outLo );
}
24
/**
 * Combine a UTF-16 surrogate pair into a code point.
 *
 * The low ten bits of hi become bits 10-19, the low ten bits of lo become
 * bits 0-9, and 0x10000 is added to the result.
 *
 * @param hi High surrogate.
 * @param lo Low surrogate.
 * @returns The combined code point.
 */
int32_t utf16tou( uint16_t hi, uint16_t lo )
{
	const uint32_t upperTen = ( static_cast<uint32_t>(hi) & 0x3FF ) << 10;
	const uint32_t lowerTen = static_cast<uint32_t>(lo) & 0x3FF;
	return static_cast<int32_t>( (upperTen | lowerTen) + 0x10000 );
}
29
30int main()
31{
32 bitprint( 0xD800u );
33 bitprint( 0xDC00u );
34 uint16_t hi, lo;
35 utoutf16( 0x1D11E, hi, lo ); // Cat face with wry smile
36 utoutf16( 0x10FFFD, hi, lo ); // Cat face with wry smile
37 utoutf16( 0x1F63C, hi, lo ); // Cat face with wry smile
38 bitprint( hi );
39 bitprint( lo );
40 printf("0x%X\n", utf16tou( hi, lo ) );
41 return 0;
42}