summaryrefslogtreecommitdiff
path: root/src/unstable/utfstring.h
diff options
context:
space:
mode:
authorMike Buland <eichlan@xagasoft.com>2012-03-25 20:00:08 +0000
committerMike Buland <eichlan@xagasoft.com>2012-03-25 20:00:08 +0000
commit469bbcf0701e1eb8a6670c23145b0da87357e178 (patch)
treeb5b062a16e46a6c5d3410b4e574cd0cc09057211 /src/unstable/utfstring.h
parentee1b79396076edc4e30aefb285fada03bb45e80d (diff)
downloadlibbu++-469bbcf0701e1eb8a6670c23145b0da87357e178.tar.gz
libbu++-469bbcf0701e1eb8a6670c23145b0da87357e178.tar.bz2
libbu++-469bbcf0701e1eb8a6670c23145b0da87357e178.tar.xz
libbu++-469bbcf0701e1eb8a6670c23145b0da87357e178.zip
Code is all reorganized. We're about ready to release. I should write up a
little explanation of the arrangement.
Diffstat (limited to 'src/unstable/utfstring.h')
-rw-r--r--src/unstable/utfstring.h174
1 files changed, 174 insertions, 0 deletions
diff --git a/src/unstable/utfstring.h b/src/unstable/utfstring.h
new file mode 100644
index 0000000..477e272
--- /dev/null
+++ b/src/unstable/utfstring.h
@@ -0,0 +1,174 @@
1/*
2 * Copyright (C) 2007-2011 Xagasoft, All rights reserved.
3 *
4 * This file is part of the libbu++ library and is released under the
5 * terms of the license contained in the file LICENSE.
6 */
7
8#ifndef BU_UTF_STRING_H
9#define BU_UTF_STRING_H
10
11#include <stdint.h>
12#include "bu/array.h"
13
14namespace Bu
15{
16 class String;
17 class Stream;
18
19 /**
20 * UtfChar holds a single Unicode code point, which is not necessarily a
21 * character: Unicode specifies "code points", and not all code points define
22 * usable characters. Some control text directionality, some apply
23 * properties to other code points which are characters. However, most of
24 * these distinctions are only important when implementing displays that
25 * comply with the Unicode standard fully.
26 */
27 typedef uint32_t UtfChar;
28
29 /**
30 * A unicode string. This class represents a string of unicode code points.
31 * Every character in unicode can be represented with 21 bits, but we don't
32 * have a datatype that's 24 bits long, so we return all code points as a
33 * 32 bit unsigned value represented by Bu::UtfChar. However, the UtfString
34 * class, for efficiency purposes doesn't store 32 bit values internally.
35 * It represents all code points in the native utf16 encoding. This means
36 * that it may be very difficult to quickly determine the length of a
37 * UtfString in code points. Unlike many Unicode handling systems, this
38 * one actually works with complete code points. When using this class you
39 * don't ever have to know about the inner workings of the different
40 * encoding schemes. All of the data is dealt with as whole code points.
41 *
42 * As an aside, this means that when encoding a UtfString to a Utf16
43 * encoding that matches your architecture this operation will be very
44 * fast since it will effectively be a raw dump of the internal data
45 * structures. However, it is highly recommended that you DO NOT use the
46 * little endian encodings if you can possibly avoid it. They are not
47 * recommended by the Unicode Consortium and are mainly supported as a
48 * means of communicating with other systems that encode their data
49 * incorrectly. That said, whenever UtfString encodes the contained string
50 * it always includes a BOM at the beginning (the byte order marker) so that
51 * proper byte order can be easily determined by the program reading the
52 * data.
53 *
54 *@todo Investigate http://www.unicode.org/reports/tr6/ for compression.
55 */
56 class UtfString
57 {
58 public:
59 enum Encoding // encodings accepted by the constructor, set(), write() and get()
60 {
61 Utf8,
62 Utf16,
63 Utf16be,
64 Utf16le,
65 Utf32,
66 Utf32be,
67 Utf32le,
68 Ucs2,
69 Ucs4,
70 GuessEncoding
71 };
72
73 UtfString();
74 UtfString( const Bu::String &sInput, Encoding eEnc=Utf8 ); // same parameters as set()
75 virtual ~UtfString();
76
77 class iterator // a position within a UtfString; dereferencing yields a UtfChar
78 {
79 friend class UtfString; // the positional constructor below is private; only the owning string may use it
80 private:
81 iterator( UtfString *pSrc, int iCodePos ) :
82 pSrc( pSrc ), iCodePos( iCodePos )
83 {
84 }
85 public:
86 iterator() :
87 pSrc( NULL ), iCodePos( 0 )
88 {
89 }
90 /** Code point at the current position; does not move the iterator. Throws Bu::ExceptionBase when there is no source string. */
91 UtfChar operator*() const
92 {
93 if( !pSrc )
94 throw Bu::ExceptionBase("invalid UtfString::iterator dereferenced.");
95 int iPos = iCodePos; // nextChar() advances the index it is given, so read through a copy
96 return pSrc->nextChar( iPos );
97 }
98 private:
99 UtfString *pSrc;
100 int iCodePos;
101 };
102
103 /**
104 * Append a UtfChar (A unicode code point) to the string. This can be
105 * any valid code point, and is just the value of the code point, no
106 * encoding necessary.
107 */
108 void append( UtfChar ch );
109
110 /**
111 * Set the value of the entire string based on the given input and
112 * encoding. The default encoding is Utf8, which is compatible with
113 * 7-bit ascii, so it's a great choice for setting UtfStrings from
114 * string literals in code.
115 */
116 void set( const Bu::String &sInput, Encoding eEnc=Utf8 );
117
118 /**
119 * This encodes the UtfString in the given encoding and outputs it to
120 * the provided stream. All Utf16 and Utf32 encodings will have the
121 * correct BOM (byte order marker) at the beginning.
122 */
123 void write( Bu::Stream &sOut, Encoding eEnc=Utf8 );
124
125 /**
126 * This encodes the UtfString in the given encoding and returns it as
127 * a binary Bu::String. Like write, this also includes the proper BOM
128 * at the beginning.
129 */
130 Bu::String get( Encoding eEnc=Utf8 );
131
132 void debug(); // NOTE(review): presumably dumps internal state for debugging - confirm in utfstring.cpp
133
134 /**
135 * This may or may not stick around, given an index, this returns a
136 * codepoint, however there isn't necessarily a 1:1 ratio between
137 * indexes and code points.
138 */
139 UtfChar get( int iIndex );
140
141 /**
142 * This is what to use if you want to iterate through a section of the
143 * UtfString and you want to use a numerical index. In most cases it
144 * will be much easier to use an iterator, though. Given an index this
145 * will return the codepoint at that position and increment iIndex an
146 * appropriate amount for it to point to the next code point.
147 */
148 UtfChar nextChar( int &iIndex );
149
150 private:
151 void append16( uint16_t i ) { aData.append( i ); } // appends one raw 16-bit unit to aData
152 // Per-encoding input helpers. NOTE(review): presumably dispatched to by set() - confirm in utfstring.cpp
153 void setUtf8( const Bu::String &sInput );
154 void setUtf16( const Bu::String &sInput );
155 void setUtf16be( const Bu::String &sInput );
156 void setUtf16le( const Bu::String &sInput );
157 void setUtf32( const Bu::String &sInput );
158 void setUtf32be( const Bu::String &sInput );
159 void setUtf32le( const Bu::String &sInput );
160 // Per-encoding output helpers. NOTE(review): presumably dispatched to by write() - confirm in utfstring.cpp
161 void writeUtf8( Bu::Stream &sOut );
162 void writeUtf16be( Bu::Stream &sOut );
163 void writeUtf16le( Bu::Stream &sOut );
164 void writeUtf32be( Bu::Stream &sOut );
165 void writeUtf32le( Bu::Stream &sOut );
166
167 private:
168 Bu::Array<uint16_t> aData; // internal storage, held as 16-bit units
169 int iRawLen; // NOTE(review): presumably the length in 16-bit units - confirm
170 int iCharLen; // NOTE(review): presumably the length in code points - confirm
171 };
172};
173
174#endif