diff options
author | Mike Buland <eichlan@xagasoft.com> | 2011-04-08 00:51:24 +0000 |
---|---|---|
committer | Mike Buland <eichlan@xagasoft.com> | 2011-04-08 00:51:24 +0000 |
commit | bc6d543210a9df6f578229c6050371ced665fd69 (patch) | |
tree | cf8bd3ac5b204d40697089e56ac3cb1445d960b4 /src/utfstring.h | |
parent | 27aecbc60be6c80ce221f29c01f743de714faa63 (diff) | |
download | libbu++-bc6d543210a9df6f578229c6050371ced665fd69.tar.gz libbu++-bc6d543210a9df6f578229c6050371ced665fd69.tar.bz2 libbu++-bc6d543210a9df6f578229c6050371ced665fd69.tar.xz libbu++-bc6d543210a9df6f578229c6050371ced665fd69.zip |
Rearranged the API a bit.
Diffstat (limited to 'src/utfstring.h')
-rw-r--r-- | src/utfstring.h | 88 |
1 files changed, 75 insertions, 13 deletions
diff --git a/src/utfstring.h b/src/utfstring.h index be3e6ad..477e272 100644 --- a/src/utfstring.h +++ b/src/utfstring.h | |||
@@ -26,6 +26,33 @@ namespace Bu | |||
26 | */ | 26 | */ |
27 | typedef uint32_t UtfChar; | 27 | typedef uint32_t UtfChar; |
28 | 28 | ||
29 | /** | ||
30 | * A unicode string. This class represents a string of unicode code points. | ||
31 | * Every character in unicode can be represented with 21 bits, but we don't | ||
32 | * have a datatype that's 24 bits long, so we return all code points as a | ||
33 | * 32 bit unsigned value represented by Bu::UtfChar. However, the UtfString | ||
34 | * class, for efficiency purposes doesn't store 32 bit values internally. | ||
35 | * It represents all code points in the native utf16 encoding. This means | ||
36 | * that it may be very difficult to quickly determine the length of a | ||
37 | * UtfString in code points. Unlike many Unicode handling systems, this | ||
38 | * one actually works with complete code points. When using this class you | ||
39 | * don't ever have to know about the inner workings of the different | ||
40 | * encoding schemes. All of the data is dealt with as whole code points. | ||
41 | * | ||
42 | * As an aside, this means that when encoding a UtfString to a Utf16 | ||
43 | * encoding that matches your architecture this operation will be very | ||
44 | * fast since it will effectively be a raw dump of the internal data | ||
45 | * structures. However, it is highly recommended that you DO NOT use the | ||
46 | * little endian encodings if you can possibly avoid it. They are not | ||
47 | * recommended by the Unicode Consortium and are mainly supported as a | ||
48 | * means of communicating with other systems that encode their data | ||
49 | * incorrectly. That said, whenever UtfString encodes the contained string | ||
50 | * it always includes a BOM at the beginning (the byte order marker) so that | ||
51 | * proper byte order can be easily determined by the program reading the | ||
52 | * data. | ||
53 | * | ||
54 | *@todo Investigate http://www.unicode.org/reports/tr6/ for compression. | ||
55 | */ | ||
29 | class UtfString | 56 | class UtfString |
30 | { | 57 | { |
31 | public: | 58 | public: |
@@ -73,9 +100,56 @@ namespace Bu | |||
73 | int iCodePos; | 100 | int iCodePos; |
74 | }; | 101 | }; |
75 | 102 | ||
103 | /** | ||
104 | * Append a UtfChar (A unicode code point) to the string. This can be | ||
105 | * any valid code point, and is just the value of the code point, no | ||
106 | * encoding necessary. | ||
107 | */ | ||
76 | void append( UtfChar ch ); | 108 | void append( UtfChar ch ); |
77 | 109 | ||
110 | /** | ||
111 | * Set the value of the entire string based on the given input and | ||
112 | * encoding. The default encoding is Utf8, which is compatible with | ||
113 | * 7-bit ascii, so it's a great choice for setting UtfStrings from | ||
114 | * string literals in code. | ||
115 | */ | ||
78 | void set( const Bu::String &sInput, Encoding eEnc=Utf8 ); | 116 | void set( const Bu::String &sInput, Encoding eEnc=Utf8 ); |
117 | |||
118 | /** | ||
119 | * This encodes the UtfString in the given encoding and outputs it to | ||
120 | * the provided stream. All Utf16 and Utf32 encodings will have the | ||
121 | * correct BOM (byte order marker) at the beginning. | ||
122 | */ | ||
123 | void write( Bu::Stream &sOut, Encoding eEnc=Utf8 ); | ||
124 | |||
125 | /** | ||
126 | * This encodes the UtfString in the given encoding and returns it as | ||
127 | * a binary Bu::String. Like write, this also includes the proper BOM | ||
128 | * at the beginning. | ||
129 | */ | ||
130 | Bu::String get( Encoding eEnc=Utf8 ); | ||
131 | |||
132 | void debug(); | ||
133 | |||
134 | /** | ||
135 | * This may or may not stick around, given an index, this returns a | ||
136 | * codepoint, however there isn't necessarily a 1:1 ratio between | ||
137 | * indexes and code points. | ||
138 | */ | ||
139 | UtfChar get( int iIndex ); | ||
140 | |||
141 | /** | ||
142 | * This is what to use if you want to iterate through a section of the | ||
143 | * UtfString and you want to use a numerical index. In most cases it | ||
144 | * will be much easier to use an iterator, though. Given an index this | ||
145 | * will return the codepoint at that position and increment iIndex an | ||
146 | * appropriate amount for it to point to the next code point. | ||
147 | */ | ||
148 | UtfChar nextChar( int &iIndex ); | ||
149 | |||
150 | private: | ||
151 | void append16( uint16_t i ) { aData.append( i ); } | ||
152 | |||
79 | void setUtf8( const Bu::String &sInput ); | 153 | void setUtf8( const Bu::String &sInput ); |
80 | void setUtf16( const Bu::String &sInput ); | 154 | void setUtf16( const Bu::String &sInput ); |
81 | void setUtf16be( const Bu::String &sInput ); | 155 | void setUtf16be( const Bu::String &sInput ); |
@@ -83,25 +157,13 @@ namespace Bu | |||
83 | void setUtf32( const Bu::String &sInput ); | 157 | void setUtf32( const Bu::String &sInput ); |
84 | void setUtf32be( const Bu::String &sInput ); | 158 | void setUtf32be( const Bu::String &sInput ); |
85 | void setUtf32le( const Bu::String &sInput ); | 159 | void setUtf32le( const Bu::String &sInput ); |
86 | 160 | ||
87 | void write( Bu::Stream &sOut, Encoding eEnc=Utf8 ); | ||
88 | void writeUtf8( Bu::Stream &sOut ); | 161 | void writeUtf8( Bu::Stream &sOut ); |
89 | void writeUtf16be( Bu::Stream &sOut ); | 162 | void writeUtf16be( Bu::Stream &sOut ); |
90 | void writeUtf16le( Bu::Stream &sOut ); | 163 | void writeUtf16le( Bu::Stream &sOut ); |
91 | void writeUtf32be( Bu::Stream &sOut ); | 164 | void writeUtf32be( Bu::Stream &sOut ); |
92 | void writeUtf32le( Bu::Stream &sOut ); | 165 | void writeUtf32le( Bu::Stream &sOut ); |
93 | 166 | ||
94 | Bu::String to( Encoding eEnc=Utf8 ); | ||
95 | Bu::String toUtf8(); | ||
96 | |||
97 | void debug(); | ||
98 | |||
99 | UtfChar get( int iIndex ); | ||
100 | UtfChar nextChar( int &iIndex ); | ||
101 | |||
102 | private: | ||
103 | void append16( uint16_t i ) { aData.append( i ); } | ||
104 | |||
105 | private: | 167 | private: |
106 | Bu::Array<uint16_t> aData; | 168 | Bu::Array<uint16_t> aData; |
107 | int iRawLen; | 169 | int iRawLen; |