summaryrefslogtreecommitdiff
path: root/src/utfstring.h
diff options
context:
space:
mode:
authorMike Buland <eichlan@xagasoft.com>2011-04-04 14:59:13 +0000
committerMike Buland <eichlan@xagasoft.com>2011-04-04 14:59:13 +0000
commit6ff23ac5097f7d92ac8840c2ba17d1dbf1eb80a5 (patch)
treefc70404d66854bba713bff2350f5f69f43bd85bc /src/utfstring.h
parentabbf45c1da7f3e3a542e6c6339a1bab31283f22e (diff)
downloadlibbu++-6ff23ac5097f7d92ac8840c2ba17d1dbf1eb80a5.tar.gz
libbu++-6ff23ac5097f7d92ac8840c2ba17d1dbf1eb80a5.tar.bz2
libbu++-6ff23ac5097f7d92ac8840c2ba17d1dbf1eb80a5.tar.xz
libbu++-6ff23ac5097f7d92ac8840c2ba17d1dbf1eb80a5.zip
UtfString is going really well. It can now parse Utf8, Utf16 (le,be), and
Utf32 (le,be). The internal storage seems to be working fine, although we do have a problem with random access. At least we can tell which half of a surrogate pair we're on, so we can always rapidly determine the entire code point from any utf16 index that we're on. The only optimization that I'm not doing yet is reading in entire 16bit or 32bit words at a time and converting them from their byte order to native. There are a few potential issues with that, so we'll see. I added a couple of testing datafiles and a test program; I'll delete them all just as soon as it's verified to write correctly.
Diffstat (limited to 'src/utfstring.h')
-rw-r--r--src/utfstring.h52
1 files changed, 49 insertions, 3 deletions
diff --git a/src/utfstring.h b/src/utfstring.h
index 79ef62e..8448ea4 100644
--- a/src/utfstring.h
+++ b/src/utfstring.h
@@ -14,6 +14,7 @@
14namespace Bu 14namespace Bu
15{ 15{
16 class String; 16 class String;
17 class Stream;
17 18
18 /** 19 /**
19 * UtfChar isn't actually a character, unicode specifies "code points" not 20 * UtfChar isn't actually a character, unicode specifies "code points" not
@@ -35,7 +36,10 @@ namespace Bu
35 Utf16be, 36 Utf16be,
36 Utf16le, 37 Utf16le,
37 Utf32, 38 Utf32,
38 Ucs16, 39 Utf32be,
40 Utf32le,
41 Ucs2,
42 Ucs4,
39 GuessEncoding 43 GuessEncoding
40 }; 44 };
41 45
@@ -43,17 +47,59 @@ namespace Bu
43 UtfString( const Bu::String &sInput, Encoding eEnc=Utf8 ); 47 UtfString( const Bu::String &sInput, Encoding eEnc=Utf8 );
44 virtual ~UtfString(); 48 virtual ~UtfString();
45 49
50 class iterator
51 {
52 private:
53 iterator( UtfString *pSrc, int iCodePos ) :
54 pSrc( pSrc ), iCodePos( iCodePos )
55 {
56 }
57
58 public:
59 iterator() :
60 pSrc( NULL ), iCodePos( 0 )
61 {
62 }
63
64 UtfChar operator*()
65 {
66 if( !pSrc )
67 throw Bu::ExceptionBase("invalid UtfString::iterator dereferenced.");
68 return pSrc->nextChar( iCodePos );
69 }
70
71 private:
72 UtfString *pSrc;
73 int iCodePos;
74 };
75
46 void append( UtfChar ch ); 76 void append( UtfChar ch );
47 77
48 void set( const Bu::String &sInput, Encoding eEnc=Utf8 ); 78 void set( const Bu::String &sInput, Encoding eEnc=Utf8 );
49 void setUtf8( const Bu::String &sInput ); 79 void setUtf8( const Bu::String &sInput );
50 void setUtf16( const Bu::String &sInput ); 80 void setUtf16( const Bu::String &sInput );
51// void setUtf16be( const Bu::String &sInput ); 81 void setUtf16be( const Bu::String &sInput );
52// void setUtf16le( const Bu::String &sInput ); 82 void setUtf16le( const Bu::String &sInput );
83 void setUtf32( const Bu::String &sInput );
84 void setUtf32be( const Bu::String &sInput );
85 void setUtf32le( const Bu::String &sInput );
86
87 void write( Bu::Stream &sOut, Encoding eEnc=Utf8 );
88 void writeUtf8( Bu::Stream &sOut );
89 void writeUtf16( Bu::Stream &sOut );
90 void writeUtf16be( Bu::Stream &sOut );
91 void writeUtf16le( Bu::Stream &sOut );
92 void writeUtf32( Bu::Stream &sOut );
93 void writeUtf32be( Bu::Stream &sOut );
94 void writeUtf32le( Bu::Stream &sOut );
95
96 Bu::String to( Encoding eEnc=Utf8 );
97 Bu::String toUtf8();
53 98
54 void debug(); 99 void debug();
55 100
56 UtfChar get( int iIndex ); 101 UtfChar get( int iIndex );
102 UtfChar nextChar( int &iIndex );
57 103
58 private: 104 private:
// Private helper: appends the 16-bit value i to aData.
59 void append16( uint16_t i ) { aData.append( i ); } 105 void append16( uint16_t i ) { aData.append( i ); }