aboutsummaryrefslogtreecommitdiff
path: root/src/unstable/utfstring.h
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/unstable/utfstring.h482
1 files changed, 241 insertions, 241 deletions
diff --git a/src/unstable/utfstring.h b/src/unstable/utfstring.h
index 1bd4cce..560faae 100644
--- a/src/unstable/utfstring.h
+++ b/src/unstable/utfstring.h
@@ -13,247 +13,247 @@
13 13
14namespace Bu 14namespace Bu
15{ 15{
16 class String; 16 class String;
17 class Stream; 17 class Stream;
18 18
/**
 * A single Unicode code point.
 *
 * Despite the name this is not a "character": Unicode is specified in terms
 * of code points, and not every code point is a usable character on its
 * own.  Some control text directionality, others apply properties to
 * neighbouring code points.  Those distinctions mostly matter only to code
 * implementing fully Unicode-compliant display.
 */
typedef uint32_t UtfChar;
28 28
29 /** 29 /**
30 * A unicode string. This class represents a string of unicode code points. 30 * A unicode string. This class represents a string of unicode code points.
31 * Every character in unicode can be represented with 21 bits, but we don't 31 * Every character in unicode can be represented with 21 bits, but we don't
32 * have a datatype that's 24 bits long, so we return all code points as a 32 * have a datatype that's 24 bits long, so we return all code points as a
33 * 32 bit unsigned value represented by Bu::UtfChar. However, the UtfString 33 * 32 bit unsigned value represented by Bu::UtfChar. However, the UtfString
34 * class, for efficiency purposes doesn't store 32 bit values internally. 34 * class, for efficiency purposes doesn't store 32 bit values internally.
35 * It represents all code points in the native utf16 encodeng. This means 35 * It represents all code points in the native utf16 encodeng. This means
36 * that it may be very difficult to quickly determine the length of a 36 * that it may be very difficult to quickly determine the length of a
37 * UtfString in code points. Unlike many Unicode handling systems, this 37 * UtfString in code points. Unlike many Unicode handling systems, this
38 * one actually works with complete code points. When using this class you 38 * one actually works with complete code points. When using this class you
39 * don't ever have to know about the inner workings of the different 39 * don't ever have to know about the inner workings of the different
40 * encoding schemes. All of the data is dealt with as whole code points. 40 * encoding schemes. All of the data is dealt with as whole code points.
41 * 41 *
42 * As an aside, this means that when encoding a UtfString to a Utf16 42 * As an aside, this means that when encoding a UtfString to a Utf16
43 * encoding that matches your archetecture this operation will be very 43 * encoding that matches your archetecture this operation will be very
44 * fast since it will effectively be a raw dump of the internal data 44 * fast since it will effectively be a raw dump of the internal data
45 * structures. However, it is highly reccomended that you DO NOT use the 45 * structures. However, it is highly reccomended that you DO NOT use the
46 * little endian encodings if you can possibly avoid it. They are not 46 * little endian encodings if you can possibly avoid it. They are not
47 * reccomended by the Unicode Consortium and are mainly supported as a 47 * reccomended by the Unicode Consortium and are mainly supported as a
48 * means of communicating with other systems that encode their data 48 * means of communicating with other systems that encode their data
49 * incorrectly. That said, whenever UtfString encodes the contained string 49 * incorrectly. That said, whenever UtfString encodes the contained string
50 * it always includes a BOM at the begining (the byte order marker) so that 50 * it always includes a BOM at the begining (the byte order marker) so that
51 * proper byte order can be easily determined by the program reading the 51 * proper byte order can be easily determined by the program reading the
52 * data. 52 * data.
53 * 53 *
54 *@todo Investigate http://www.unicode.org/reports/tr6/ for compression. 54 *@todo Investigate http://www.unicode.org/reports/tr6/ for compression.
55 */ 55 */
56 class UtfString 56 class UtfString
57 { 57 {
58 public: 58 public:
59 enum Encoding 59 enum Encoding
60 { 60 {
61 Utf8, 61 Utf8,
62 Utf16, 62 Utf16,
63 Utf16be, 63 Utf16be,
64 Utf16le, 64 Utf16le,
65 Utf32, 65 Utf32,
66 Utf32be, 66 Utf32be,
67 Utf32le, 67 Utf32le,
68 Ucs2, 68 Ucs2,
69 Ucs4, 69 Ucs4,
70 GuessEncoding 70 GuessEncoding
71 }; 71 };
72 72
73 UtfString(); 73 UtfString();
74 UtfString( const Bu::String &sInput, Encoding eEnc=Utf8 ); 74 UtfString( const Bu::String &sInput, Encoding eEnc=Utf8 );
75 UtfString( const char *sInput, Encoding eEnc=Utf8 ); 75 UtfString( const char *sInput, Encoding eEnc=Utf8 );
76 virtual ~UtfString(); 76 virtual ~UtfString();
77 77
78 class iterator 78 class iterator
79 { 79 {
80 friend class UtfString; 80 friend class UtfString;
81 private: 81 private:
82 iterator( UtfString *pSrc, int iCodePos ) : 82 iterator( UtfString *pSrc, int iCodePos ) :
83 pSrc( pSrc ), iCodePos( iCodePos ) 83 pSrc( pSrc ), iCodePos( iCodePos )
84 { 84 {
85 } 85 }
86 86
87 public: 87 public:
88 iterator() : 88 iterator() :
89 pSrc( NULL ), iCodePos( 0 ) 89 pSrc( NULL ), iCodePos( 0 )
90 { 90 {
91 } 91 }
92 92
93 UtfChar operator*() 93 UtfChar operator*()
94 { 94 {
95 if( !pSrc ) 95 if( !pSrc )
96 throw Bu::ExceptionBase("invalid UtfString::iterator dereferenced."); 96 throw Bu::ExceptionBase("invalid UtfString::iterator dereferenced.");
97 return pSrc->get( iCodePos ); 97 return pSrc->get( iCodePos );
98 } 98 }
99 99
100 iterator operator++() 100 iterator operator++()
101 { 101 {
102 pSrc->nextChar( iCodePos ); 102 pSrc->nextChar( iCodePos );
103 return *this; 103 return *this;
104 } 104 }
105 105
106 iterator operator++( int ) 106 iterator operator++( int )
107 { 107 {
108 pSrc->nextChar( iCodePos ); 108 pSrc->nextChar( iCodePos );
109 return *this; 109 return *this;
110 } 110 }
111 111
112 operator bool() const 112 operator bool() const
113 { 113 {
114 return iCodePos < pSrc->aData.getSize(); 114 return iCodePos < pSrc->aData.getSize();
115 } 115 }
116 116
117 private: 117 private:
118 UtfString *pSrc; 118 UtfString *pSrc;
119 int iCodePos; 119 int iCodePos;
120 }; 120 };
121 121
122 class const_iterator 122 class const_iterator
123 { 123 {
124 friend class UtfString; 124 friend class UtfString;
125 private: 125 private:
126 const_iterator( const UtfString *pSrc, int iCodePos ) : 126 const_iterator( const UtfString *pSrc, int iCodePos ) :
127 pSrc( pSrc ), iCodePos( iCodePos ) 127 pSrc( pSrc ), iCodePos( iCodePos )
128 { 128 {
129 } 129 }
130 130
131 public: 131 public:
132 const_iterator() : 132 const_iterator() :
133 pSrc( NULL ), iCodePos( 0 ) 133 pSrc( NULL ), iCodePos( 0 )
134 { 134 {
135 } 135 }
136 136
137 UtfChar operator*() 137 UtfChar operator*()
138 { 138 {
139 if( !pSrc ) 139 if( !pSrc )
140 throw Bu::ExceptionBase("invalid UtfString::iterator dereferenced."); 140 throw Bu::ExceptionBase("invalid UtfString::iterator dereferenced.");
141 return pSrc->get( iCodePos ); 141 return pSrc->get( iCodePos );
142 } 142 }
143 143
144 const_iterator operator++() 144 const_iterator operator++()
145 { 145 {
146 pSrc->nextChar( iCodePos ); 146 pSrc->nextChar( iCodePos );
147 return *this; 147 return *this;
148 } 148 }
149 149
150 const_iterator operator++( int ) 150 const_iterator operator++( int )
151 { 151 {
152 pSrc->nextChar( iCodePos ); 152 pSrc->nextChar( iCodePos );
153 return *this; 153 return *this;
154 } 154 }
155 155
156 operator bool() const 156 operator bool() const
157 { 157 {
158 return iCodePos < pSrc->aData.getSize(); 158 return iCodePos < pSrc->aData.getSize();
159 } 159 }
160 160
161 private: 161 private:
162 const UtfString *pSrc; 162 const UtfString *pSrc;
163 int iCodePos; 163 int iCodePos;
164 }; 164 };
165 165
166 iterator begin(); 166 iterator begin();
167 const_iterator begin() const; 167 const_iterator begin() const;
168 168
169 /** 169 /**
170 * Append a UtfChar (A unicode code point) to the string. This can be 170 * Append a UtfChar (A unicode code point) to the string. This can be
171 * any valid code point, and is just the value of the code point, no 171 * any valid code point, and is just the value of the code point, no
172 * encoding necessary. 172 * encoding necessary.
173 */ 173 */
174 void append( UtfChar ch ); 174 void append( UtfChar ch );
175 175
176 void append( const UtfString &rSrc ); 176 void append( const UtfString &rSrc );
177 177
178 /** 178 /**
179 * Set the value of the entire string based on the given input and 179 * Set the value of the entire string based on the given input and
180 * encoding. The default encoding is Utf8, which is compatible with 180 * encoding. The default encoding is Utf8, which is compatible with
181 * 7-bit ascii, so it's a great choice for setting UtfStrings from 181 * 7-bit ascii, so it's a great choice for setting UtfStrings from
182 * string literals in code. 182 * string literals in code.
183 */ 183 */
184 void set( const Bu::String &sInput, Encoding eEnc=Utf8 ); 184 void set( const Bu::String &sInput, Encoding eEnc=Utf8 );
185 185
186 /** 186 /**
187 * This encodes the UtfString in the given encoding and outputs it to 187 * This encodes the UtfString in the given encoding and outputs it to
188 * the provided stream. all Utf16 and Utf32 encodings will have the 188 * the provided stream. all Utf16 and Utf32 encodings will have the
189 * correct BOM (byte order marker) at the begining. 189 * correct BOM (byte order marker) at the begining.
190 */ 190 */
191 void write( Bu::Stream &sOut, Encoding eEnc=Utf8 ) const; 191 void write( Bu::Stream &sOut, Encoding eEnc=Utf8 ) const;
192 192
193 /** 193 /**
194 * This encodes the UtfString in the given encoding and returns it as 194 * This encodes the UtfString in the given encoding and returns it as
195 * a binary Bu::String. Like write, this also includes the proper BOM 195 * a binary Bu::String. Like write, this also includes the proper BOM
196 * at the begining. 196 * at the begining.
197 */ 197 */
198 Bu::String get( Encoding eEnc=Utf8 ) const; 198 Bu::String get( Encoding eEnc=Utf8 ) const;
199 199
200 void debug() const; 200 void debug() const;
201 201
202 /** 202 /**
203 * This may or may not stick around, given an index, this returns a 203 * This may or may not stick around, given an index, this returns a
204 * codepoint, however there isn't necesarilly a 1:1 ratio between 204 * codepoint, however there isn't necesarilly a 1:1 ratio between
205 * indexes and code points. 205 * indexes and code points.
206 */ 206 */
207 UtfChar get( int iIndex ) const; 207 UtfChar get( int iIndex ) const;
208 208
209 /** 209 /**
210 * This is what to use if you want to iterate through a section of the 210 * This is what to use if you want to iterate through a section of the
211 * UtfString and you want to use a numerical index. In most cases it 211 * UtfString and you want to use a numerical index. In most cases it
212 * will be much easier to use an iterator, though. Given an index this 212 * will be much easier to use an iterator, though. Given an index this
213 * will return the codepoint at that position and increment iIndex an 213 * will return the codepoint at that position and increment iIndex an
214 * appropriate amount for it to point to the next code point. 214 * appropriate amount for it to point to the next code point.
215 */ 215 */
216 UtfChar nextChar( int &iIndex ) const; 216 UtfChar nextChar( int &iIndex ) const;
217 217
218 bool operator==( const Bu::UtfString &rhs ) const; 218 bool operator==( const Bu::UtfString &rhs ) const;
219 UtfString &operator+=( const Bu::UtfString &rhs ); 219 UtfString &operator+=( const Bu::UtfString &rhs );
220 UtfString &operator+=( const UtfChar &rhs ); 220 UtfString &operator+=( const UtfChar &rhs );
221 221
222 private: 222 private:
223 void append16( uint16_t i ) { aData.append( i ); } 223 void append16( uint16_t i ) { aData.append( i ); }
224 224
225 void setUtf8( const Bu::String &sInput ); 225 void setUtf8( const Bu::String &sInput );
226 void setUtf16( const Bu::String &sInput ); 226 void setUtf16( const Bu::String &sInput );
227 void setUtf16be( const Bu::String &sInput ); 227 void setUtf16be( const Bu::String &sInput );
228 void setUtf16le( const Bu::String &sInput ); 228 void setUtf16le( const Bu::String &sInput );
229 void setUtf32( const Bu::String &sInput ); 229 void setUtf32( const Bu::String &sInput );
230 void setUtf32be( const Bu::String &sInput ); 230 void setUtf32be( const Bu::String &sInput );
231 void setUtf32le( const Bu::String &sInput ); 231 void setUtf32le( const Bu::String &sInput );
232 232
233 void writeUtf8( Bu::Stream &sOut ) const; 233 void writeUtf8( Bu::Stream &sOut ) const;
234 void writeUtf16be( Bu::Stream &sOut ) const; 234 void writeUtf16be( Bu::Stream &sOut ) const;
235 void writeUtf16le( Bu::Stream &sOut ) const; 235 void writeUtf16le( Bu::Stream &sOut ) const;
236 void writeUtf32be( Bu::Stream &sOut ) const; 236 void writeUtf32be( Bu::Stream &sOut ) const;
237 void writeUtf32le( Bu::Stream &sOut ) const; 237 void writeUtf32le( Bu::Stream &sOut ) const;
238 238
239 private: 239 private:
240 Bu::Array<uint16_t> aData; 240 Bu::Array<uint16_t> aData;
241 int iRawLen; 241 int iRawLen;
242 int iCharLen; 242 int iCharLen;
243 }; 243 };
244 244
245 // 245 //
246 // Hash support 246 // Hash support
247 // 247 //
248 template<typename T> 248 template<typename T>
249 uint32_t __calcHashCode( const T &k ); 249 uint32_t __calcHashCode( const T &k );
250 250
251 template<typename T> 251 template<typename T>
252 bool __cmpHashKeys( const T &a, const T &b ); 252 bool __cmpHashKeys( const T &a, const T &b );
253 253
254 template<> uint32_t __calcHashCode<UtfString>( const UtfString &k ); 254 template<> uint32_t __calcHashCode<UtfString>( const UtfString &k );
255 template<> bool __cmpHashKeys<UtfString>( 255 template<> bool __cmpHashKeys<UtfString>(
256 const UtfString &a, const UtfString &b ); 256 const UtfString &a, const UtfString &b );
257}; 257};
258 258
259#endif 259#endif