From 6ff23ac5097f7d92ac8840c2ba17d1dbf1eb80a5 Mon Sep 17 00:00:00 2001 From: Mike Buland Date: Mon, 4 Apr 2011 14:59:13 +0000 Subject: UtfString is going really well. It can now parse Utf8, Utf16 (le,be), and Utf32 (le,be). The internal storage seems to be working fine, although we do have a problem with random access, but at least we can tell which half of a surrogate pair we're on, so we can always rapidly determine the entire code point from any utf16 index that we're on. The only optimization that I'm not doing yet is reading in entire 16bit or 32bit words at a time and converting them from their byte order to native. There are a few potential issues with that, so we'll see. I added a couple of testing datafiles and a test program, I'll delete them all just as soon as it's verified to write correctly. --- utf16.cpp | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 utf16.cpp (limited to 'utf16.cpp') diff --git a/utf16.cpp b/utf16.cpp new file mode 100644 index 0000000..eedb521 --- /dev/null +++ b/utf16.cpp @@ -0,0 +1,42 @@ +#include +#include + +void bitprint( uint16_t u ) +{ + for( int i = 15; i >= 0; i-- ) + printf("%c", (u&(1<= 0; i-- ) + printf("%c", (u&(1<>10)&0x3FF)| 0xD800u; + outLo = ((in-0x10000)&0x3FF)| 0xDC00u; + printf("0x%X == 0x%X, 0x%X\n", in, outHi, outLo ); +} + +int32_t utf16tou( uint16_t hi, uint16_t lo ) +{ + return (((uint32_t)hi&0x3FF)<<10 | lo&0x3FF)+0x10000; +} + +int main() +{ + bitprint( 0xD800u ); + bitprint( 0xDC00u ); + uint16_t hi, lo; + utoutf16( 0x1D11E, hi, lo ); // Cat face with wry smile + utoutf16( 0x10FFFD, hi, lo ); // Cat face with wry smile + utoutf16( 0x1F63C, hi, lo ); // Cat face with wry smile + bitprint( hi ); + bitprint( lo ); + printf("0x%X\n", utf16tou( hi, lo ) ); + return 0; +} -- cgit v1.2.3