diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/utfstring.cpp | 240 | ||||
| -rw-r--r-- | src/utfstring.h | 52 |
2 files changed, 274 insertions, 18 deletions
diff --git a/src/utfstring.cpp b/src/utfstring.cpp index bb0a011..7c4ba19 100644 --- a/src/utfstring.cpp +++ b/src/utfstring.cpp | |||
| @@ -8,9 +8,13 @@ | |||
| 8 | #include "bu/utfstring.h" | 8 | #include "bu/utfstring.h" |
| 9 | 9 | ||
| 10 | #include "bu/string.h" | 10 | #include "bu/string.h" |
| 11 | #include "bu/stream.h" | ||
| 11 | 12 | ||
| 12 | #include <endian.h> | 13 | #include <endian.h> |
| 13 | 14 | ||
| 15 | #include "bu/sio.h" | ||
| 16 | using Bu::sio; | ||
| 17 | |||
| 14 | Bu::UtfString::UtfString() | 18 | Bu::UtfString::UtfString() |
| 15 | { | 19 | { |
| 16 | } | 20 | } |
| @@ -33,20 +37,35 @@ void Bu::UtfString::set( const Bu::String &sInput, Encoding eEnc ) | |||
| 33 | break; | 37 | break; |
| 34 | 38 | ||
| 35 | case Utf16: | 39 | case Utf16: |
| 36 | case Utf16be: | ||
| 37 | setUtf16( sInput ); | 40 | setUtf16( sInput ); |
| 38 | break; | 41 | break; |
| 39 | 42 | ||
| 43 | case Utf16be: | ||
| 44 | setUtf16be( sInput ); | ||
| 45 | break; | ||
| 46 | |||
| 40 | case Utf16le: | 47 | case Utf16le: |
| 41 | throw Bu::ExceptionBase("Utf16le not supported yet."); | 48 | setUtf16le( sInput ); |
| 42 | break; | 49 | break; |
| 43 | 50 | ||
| 44 | case Utf32: | 51 | case Utf32: |
| 45 | throw Bu::ExceptionBase("Utf32 not supported yet."); | 52 | setUtf32( sInput ); |
| 53 | break; | ||
| 54 | |||
| 55 | case Utf32be: | ||
| 56 | setUtf32be( sInput ); | ||
| 57 | break; | ||
| 58 | |||
| 59 | case Utf32le: | ||
| 60 | setUtf32le( sInput ); | ||
| 61 | break; | ||
| 62 | |||
| 63 | case Ucs2: | ||
| 64 | throw Bu::ExceptionBase("Ucs2 not supported yet."); | ||
| 46 | break; | 65 | break; |
| 47 | 66 | ||
| 48 | case Ucs16: | 67 | case Ucs4: |
| 49 | throw Bu::ExceptionBase("Ucs16 not supported yet."); | 68 | throw Bu::ExceptionBase("Ucs4 not supported yet."); |
| 50 | break; | 69 | break; |
| 51 | 70 | ||
| 52 | case GuessEncoding: | 71 | case GuessEncoding: |
| @@ -104,8 +123,32 @@ void Bu::UtfString::setUtf8( const Bu::String &sInput ) | |||
| 104 | 123 | ||
| 105 | void Bu::UtfString::setUtf16( const Bu::String &sInput ) | 124 | void Bu::UtfString::setUtf16( const Bu::String &sInput ) |
| 106 | { | 125 | { |
| 126 | Bu::String::const_iterator i = sInput.begin(); | ||
| 127 | if( (uint8_t)*sInput.begin() == 0xFF && | ||
| 128 | (uint8_t)*(sInput.begin()+1) == 0xFE ) | ||
| 129 | { | ||
| 130 | setUtf16le( sInput ); | ||
| 131 | return; | ||
| 132 | } | ||
| 133 | setUtf16be( sInput ); | ||
| 134 | } | ||
| 135 | |||
| 136 | void Bu::UtfString::setUtf16be( const Bu::String &sInput ) | ||
| 137 | { | ||
| 138 | Bu::String::const_iterator i = sInput.begin(); | ||
| 139 | if( (uint8_t)*sInput.begin() == 0xFE && | ||
| 140 | (uint8_t)*(sInput.begin()+1) == 0xFF ) | ||
| 141 | |||
| 142 | { | ||
| 143 | i += 2; | ||
| 144 | sio << "Verified big endian." << sio.nl; | ||
| 145 | } | ||
| 146 | else | ||
| 147 | { | ||
| 148 | sio << "Assuming big endian." << sio.nl; | ||
| 149 | } | ||
| 107 | uint16_t hi, lo; | 150 | uint16_t hi, lo; |
| 108 | for( Bu::String::const_iterator i = sInput.begin(); i; i++ ) | 151 | for( ; i; i++ ) |
| 109 | { | 152 | { |
| 110 | hi = (((uint8_t)*i)<<8) | ((uint8_t)*(++i)); | 153 | hi = (((uint8_t)*i)<<8) | ((uint8_t)*(++i)); |
| 111 | append16( hi ); | 154 | append16( hi ); |
| @@ -117,25 +160,192 @@ void Bu::UtfString::setUtf16( const Bu::String &sInput ) | |||
| 117 | } | 160 | } |
| 118 | } | 161 | } |
| 119 | 162 | ||
| 120 | #include "bu/sio.h" | 163 | void Bu::UtfString::setUtf16le( const Bu::String &sInput ) |
| 121 | using Bu::sio; | 164 | { |
| 165 | Bu::String::const_iterator i = sInput.begin(); | ||
| 166 | if( (uint8_t)*sInput.begin() == 0xFF && | ||
| 167 | (uint8_t)*(sInput.begin()+1) == 0xFE ) | ||
| 168 | { | ||
| 169 | i += 2; | ||
| 170 | sio << "Verified little endian." << sio.nl; | ||
| 171 | } | ||
| 172 | else | ||
| 173 | { | ||
| 174 | sio << "Assuming little endian." << sio.nl; | ||
| 175 | } | ||
| 176 | uint16_t hi, lo; | ||
| 177 | for( ; i; i++ ) | ||
| 178 | { | ||
| 179 | hi = (((uint8_t)*i)) | ((uint8_t)*(++i)<<8); | ||
| 180 | append16( hi ); | ||
| 181 | if( (hi&0xD800u) == 0xD800u ) | ||
| 182 | { | ||
| 183 | lo = (((uint8_t)*(++i))) | ((uint8_t)*(++i)<<8); | ||
| 184 | append16( lo ); | ||
| 185 | } | ||
| 186 | } | ||
| 187 | } | ||
| 188 | |||
| 189 | void Bu::UtfString::setUtf32( const Bu::String &sInput ) | ||
| 190 | { | ||
| 191 | Bu::String::const_iterator i = sInput.begin(); | ||
| 192 | if( (uint8_t)*i == 0x00 && | ||
| 193 | (uint8_t)*(++i) == 0x00 && | ||
| 194 | (uint8_t)*(++i) == 0xFF && | ||
| 195 | (uint8_t)*(++i) == 0xFE ) | ||
| 196 | { | ||
| 197 | setUtf32le( sInput ); | ||
| 198 | return; | ||
| 199 | } | ||
| 200 | setUtf32be( sInput ); | ||
| 201 | } | ||
| 202 | |||
| 203 | void Bu::UtfString::setUtf32be( const Bu::String &sInput ) | ||
| 204 | { | ||
| 205 | Bu::String::const_iterator i = sInput.begin(); | ||
| 206 | if( (uint8_t)*i == 0x00 && | ||
| 207 | (uint8_t)*(++i) == 0x00 && | ||
| 208 | (uint8_t)*(++i) == 0xFE && | ||
| 209 | (uint8_t)*(++i) == 0xFF ) | ||
| 210 | { | ||
| 211 | i++; | ||
| 212 | sio << "Verified big endian." << sio.nl; | ||
| 213 | } | ||
| 214 | else | ||
| 215 | { | ||
| 216 | i = sInput.begin(); | ||
| 217 | sio << "Assuming big endian." << sio.nl; | ||
| 218 | } | ||
| 219 | for( ; i; i++ ) | ||
| 220 | { | ||
| 221 | append( (((uint8_t)*i)<<24) | | ||
| 222 | (((uint8_t)*(++i))<<16) | | ||
| 223 | (((uint8_t)*(++i))<<8) | | ||
| 224 | ((uint8_t)*(++i)) | ||
| 225 | ); | ||
| 226 | } | ||
| 227 | } | ||
| 228 | |||
| 229 | void Bu::UtfString::setUtf32le( const Bu::String &sInput ) | ||
| 230 | { | ||
| 231 | Bu::String::const_iterator i = sInput.begin(); | ||
| 232 | if( (uint8_t)*i == 0x00 && | ||
| 233 | (uint8_t)*(++i) == 0x00 && | ||
| 234 | (uint8_t)*(++i) == 0xFF && | ||
| 235 | (uint8_t)*(++i) == 0xFE ) | ||
| 236 | { | ||
| 237 | i++; | ||
| 238 | sio << "Verified little endian." << sio.nl; | ||
| 239 | } | ||
| 240 | else | ||
| 241 | { | ||
| 242 | i = sInput.begin(); | ||
| 243 | sio << "Assuming little endian." << sio.nl; | ||
| 244 | } | ||
| 245 | for( ; i; i++ ) | ||
| 246 | { | ||
| 247 | append( ((uint8_t)*i) | | ||
| 248 | (((uint8_t)*(++i))<<8) | | ||
| 249 | (((uint8_t)*(++i))<<16) | | ||
| 250 | (((uint8_t)*(++i))<<24) | ||
| 251 | ); | ||
| 252 | } | ||
| 253 | } | ||
| 254 | |||
| 255 | void Bu::UtfString::write( Bu::Stream &sOut, Encoding eEnc ) | ||
| 256 | { | ||
| 257 | switch( eEnc ) | ||
| 258 | { | ||
| 259 | case Utf8: | ||
| 260 | writeUtf8( sOut ); | ||
| 261 | break; | ||
| 262 | |||
| 263 | case Utf16: | ||
| 264 | writeUtf16( sOut ); | ||
| 265 | break; | ||
| 266 | |||
| 267 | case Utf16be: | ||
| 268 | writeUtf16be( sOut ); | ||
| 269 | break; | ||
| 270 | |||
| 271 | case Utf16le: | ||
| 272 | writeUtf16le( sOut ); | ||
| 273 | break; | ||
| 274 | |||
| 275 | case Utf32: | ||
| 276 | writeUtf32( sOut ); | ||
| 277 | break; | ||
| 278 | |||
| 279 | case Utf32be: | ||
| 280 | writeUtf32be( sOut ); | ||
| 281 | break; | ||
| 282 | |||
| 283 | case Utf32le: | ||
| 284 | writeUtf32le( sOut ); | ||
| 285 | break; | ||
| 286 | |||
| 287 | case Ucs2: | ||
| 288 | throw Bu::ExceptionBase("Ucs2 not supported yet."); | ||
| 289 | break; | ||
| 290 | |||
| 291 | case Ucs4: | ||
| 292 | throw Bu::ExceptionBase("Ucs4 not supported yet."); | ||
| 293 | break; | ||
| 294 | |||
| 295 | case GuessEncoding: | ||
| 296 | throw Bu::ExceptionBase( | ||
| 297 | "GuessEncoding is incompatible with encoding."); | ||
| 298 | break; | ||
| 299 | |||
| 300 | } | ||
| 301 | } | ||
| 302 | |||
| 303 | void Bu::UtfString::writeUtf8( Bu::Stream &sOut ) | ||
| 304 | { | ||
| 305 | } | ||
| 306 | |||
| 307 | void Bu::UtfString::writeUtf16( Bu::Stream &sOut ) | ||
| 308 | { | ||
| 309 | } | ||
| 310 | |||
| 311 | void Bu::UtfString::writeUtf16be( Bu::Stream &sOut ) | ||
| 312 | { | ||
| 313 | } | ||
| 314 | |||
| 315 | void Bu::UtfString::writeUtf16le( Bu::Stream &sOut ) | ||
| 316 | { | ||
| 317 | } | ||
| 318 | |||
| 319 | void Bu::UtfString::writeUtf32( Bu::Stream &sOut ) | ||
| 320 | { | ||
| 321 | } | ||
| 322 | |||
| 323 | void Bu::UtfString::writeUtf32be( Bu::Stream &sOut ) | ||
| 324 | { | ||
| 325 | } | ||
| 326 | |||
| 327 | void Bu::UtfString::writeUtf32le( Bu::Stream &sOut ) | ||
| 328 | { | ||
| 329 | } | ||
| 122 | 330 | ||
| 123 | Bu::UtfChar Bu::UtfString::get( int iIndex ) | 331 | Bu::UtfChar Bu::UtfString::get( int iIndex ) |
| 124 | { | 332 | { |
| 125 | Bu::UtfChar i = aData[iIndex]; | 333 | return nextChar( iIndex ); |
| 334 | } | ||
| 335 | |||
| 336 | Bu::UtfChar Bu::UtfString::nextChar( int &iIndex ) | ||
| 337 | { | ||
| 338 | Bu::UtfChar i = aData[iIndex++]; | ||
| 126 | switch( i&0xFC00 ) | 339 | switch( i&0xFC00 ) |
| 127 | { | 340 | { |
| 128 | case 0xD800: | 341 | case 0xD800: |
| 129 | sio << "(hi) "; | 342 | return (((i&0x3FF)<<10) | ((aData[iIndex++]&0x3FF)))+0x10000; |
| 130 | return (((i&0x3FF)<<10) | ((aData[iIndex+1]&0x3FF)))+0x10000; | ||
| 131 | 343 | ||
| 132 | case 0xDC00: | 344 | case 0xDC00: |
| 133 | sio << "(lo) "; | 345 | return (((aData[iIndex-2]&0x3FF)<<10) | ((i&0x3FF)))+0x10000; |
| 134 | return 0; | ||
| 135 | 346 | ||
| 136 | default: | 347 | default: |
| 137 | sio << "(--) "; | 348 | return i; |
| 138 | return i&0xFC00; | ||
| 139 | } | 349 | } |
| 140 | } | 350 | } |
| 141 | 351 | ||
diff --git a/src/utfstring.h b/src/utfstring.h index 79ef62e..8448ea4 100644 --- a/src/utfstring.h +++ b/src/utfstring.h | |||
| @@ -14,6 +14,7 @@ | |||
| 14 | namespace Bu | 14 | namespace Bu |
| 15 | { | 15 | { |
| 16 | class String; | 16 | class String; |
| 17 | class Stream; | ||
| 17 | 18 | ||
| 18 | /** | 19 | /** |
| 19 | * UtfChar isn't actually a character, unicode specifies "code points" not | 20 | * UtfChar isn't actually a character, unicode specifies "code points" not |
| @@ -35,7 +36,10 @@ namespace Bu | |||
| 35 | Utf16be, | 36 | Utf16be, |
| 36 | Utf16le, | 37 | Utf16le, |
| 37 | Utf32, | 38 | Utf32, |
| 38 | Ucs16, | 39 | Utf32be, |
| 40 | Utf32le, | ||
| 41 | Ucs2, | ||
| 42 | Ucs4, | ||
| 39 | GuessEncoding | 43 | GuessEncoding |
| 40 | }; | 44 | }; |
| 41 | 45 | ||
| @@ -43,17 +47,59 @@ namespace Bu | |||
| 43 | UtfString( const Bu::String &sInput, Encoding eEnc=Utf8 ); | 47 | UtfString( const Bu::String &sInput, Encoding eEnc=Utf8 ); |
| 44 | virtual ~UtfString(); | 48 | virtual ~UtfString(); |
| 45 | 49 | ||
| 50 | class iterator | ||
| 51 | { | ||
| 52 | private: | ||
| 53 | iterator( UtfString *pSrc, int iCodePos ) : | ||
| 54 | pSrc( pSrc ), iCodePos( iCodePos ) | ||
| 55 | { | ||
| 56 | } | ||
| 57 | |||
| 58 | public: | ||
| 59 | iterator() : | ||
| 60 | pSrc( NULL ), iCodePos( 0 ) | ||
| 61 | { | ||
| 62 | } | ||
| 63 | |||
| 64 | UtfChar operator*() | ||
| 65 | { | ||
| 66 | if( !pSrc ) | ||
| 67 | throw Bu::ExceptionBase("invalid UtfString::iterator dereferenced."); | ||
| 68 | return pSrc->nextChar( iCodePos ); | ||
| 69 | } | ||
| 70 | |||
| 71 | private: | ||
| 72 | UtfString *pSrc; | ||
| 73 | int iCodePos; | ||
| 74 | }; | ||
| 75 | |||
| 46 | void append( UtfChar ch ); | 76 | void append( UtfChar ch ); |
| 47 | 77 | ||
| 48 | void set( const Bu::String &sInput, Encoding eEnc=Utf8 ); | 78 | void set( const Bu::String &sInput, Encoding eEnc=Utf8 ); |
| 49 | void setUtf8( const Bu::String &sInput ); | 79 | void setUtf8( const Bu::String &sInput ); |
| 50 | void setUtf16( const Bu::String &sInput ); | 80 | void setUtf16( const Bu::String &sInput ); |
| 51 | // void setUtf16be( const Bu::String &sInput ); | 81 | void setUtf16be( const Bu::String &sInput ); |
| 52 | // void setUtf16le( const Bu::String &sInput ); | 82 | void setUtf16le( const Bu::String &sInput ); |
| 83 | void setUtf32( const Bu::String &sInput ); | ||
| 84 | void setUtf32be( const Bu::String &sInput ); | ||
| 85 | void setUtf32le( const Bu::String &sInput ); | ||
| 86 | |||
| 87 | void write( Bu::Stream &sOut, Encoding eEnc=Utf8 ); | ||
| 88 | void writeUtf8( Bu::Stream &sOut ); | ||
| 89 | void writeUtf16( Bu::Stream &sOut ); | ||
| 90 | void writeUtf16be( Bu::Stream &sOut ); | ||
| 91 | void writeUtf16le( Bu::Stream &sOut ); | ||
| 92 | void writeUtf32( Bu::Stream &sOut ); | ||
| 93 | void writeUtf32be( Bu::Stream &sOut ); | ||
| 94 | void writeUtf32le( Bu::Stream &sOut ); | ||
| 95 | |||
| 96 | Bu::String to( Encoding eEnc=Utf8 ); | ||
| 97 | Bu::String toUtf8(); | ||
| 53 | 98 | ||
| 54 | void debug(); | 99 | void debug(); |
| 55 | 100 | ||
| 56 | UtfChar get( int iIndex ); | 101 | UtfChar get( int iIndex ); |
| 102 | UtfChar nextChar( int &iIndex ); | ||
| 57 | 103 | ||
| 58 | private: | 104 | private: |
| 59 | void append16( uint16_t i ) { aData.append( i ); } | 105 | void append16( uint16_t i ) { aData.append( i ); } |
