diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/config.h | 3 | ||||
-rw-r--r-- | src/tests/utf.cpp | 3 | ||||
-rw-r--r-- | src/utfstring.cpp | 143 | ||||
-rw-r--r-- | src/utfstring.h | 20 |
4 files changed, 164 insertions, 5 deletions
diff --git a/src/config.h b/src/config.h index 3046b59..ce954de 100644 --- a/src/config.h +++ b/src/config.h | |||
@@ -17,4 +17,7 @@ | |||
17 | 17 | ||
18 | #include "bu/extratypes.h" | 18 | #include "bu/extratypes.h" |
19 | 19 | ||
20 | // Later if we need autoconfig stuff, here's where it'll go. | ||
21 | // #include "bu/autoconfig.h" | ||
22 | |||
20 | #endif | 23 | #endif |
diff --git a/src/tests/utf.cpp b/src/tests/utf.cpp index 59d49c6..9e075e2 100644 --- a/src/tests/utf.cpp +++ b/src/tests/utf.cpp | |||
@@ -16,7 +16,8 @@ int main( int argc, char *argv[] ) | |||
16 | int iAmnt = fIn.read( buf, 4096 ); | 16 | int iAmnt = fIn.read( buf, 4096 ); |
17 | sUtf8.append( buf, iAmnt ); | 17 | sUtf8.append( buf, iAmnt ); |
18 | } | 18 | } |
19 | Bu::UtfString::debugUtf8( sUtf8 ); | 19 | Bu::UtfString us( sUtf8, Bu::UtfString::Utf16 ); |
20 | us.debug(); | ||
20 | } | 21 | } |
21 | } | 22 | } |
22 | 23 | ||
diff --git a/src/utfstring.cpp b/src/utfstring.cpp index 0e2060b..bb0a011 100644 --- a/src/utfstring.cpp +++ b/src/utfstring.cpp | |||
@@ -9,17 +9,156 @@ | |||
9 | 9 | ||
10 | #include "bu/string.h" | 10 | #include "bu/string.h" |
11 | 11 | ||
12 | #include <endian.h> | ||
13 | |||
12 | Bu::UtfString::UtfString() | 14 | Bu::UtfString::UtfString() |
13 | { | 15 | { |
14 | } | 16 | } |
15 | 17 | ||
18 | Bu::UtfString::UtfString( const Bu::String &sInput, Encoding eEnc ) | ||
19 | { | ||
20 | set( sInput, eEnc ); | ||
21 | } | ||
22 | |||
16 | Bu::UtfString::~UtfString() | 23 | Bu::UtfString::~UtfString() |
17 | { | 24 | { |
18 | } | 25 | } |
19 | 26 | ||
27 | void Bu::UtfString::set( const Bu::String &sInput, Encoding eEnc ) | ||
28 | { | ||
29 | switch( eEnc ) | ||
30 | { | ||
31 | case Utf8: | ||
32 | setUtf8( sInput ); | ||
33 | break; | ||
34 | |||
35 | case Utf16: | ||
36 | case Utf16be: | ||
37 | setUtf16( sInput ); | ||
38 | break; | ||
39 | |||
40 | case Utf16le: | ||
41 | throw Bu::ExceptionBase("Utf16le not supported yet."); | ||
42 | break; | ||
43 | |||
44 | case Utf32: | ||
45 | throw Bu::ExceptionBase("Utf32 not supported yet."); | ||
46 | break; | ||
47 | |||
48 | case Ucs16: | ||
49 | throw Bu::ExceptionBase("Ucs16 not supported yet."); | ||
50 | break; | ||
51 | |||
52 | case GuessEncoding: | ||
53 | throw Bu::ExceptionBase("Guessing mode not supported yet."); | ||
54 | break; | ||
55 | } | ||
56 | } | ||
57 | |||
58 | void Bu::UtfString::append( UtfChar ch ) | ||
59 | { | ||
60 | if( ch >= 0x10000 ) | ||
61 | { | ||
62 | ch -= 0x10000; | ||
63 | append16( ((ch>>10)&0x3FF)| 0xD800u ); | ||
64 | append16( (ch&0x3FF)| 0xDC00u ); | ||
65 | } | ||
66 | else | ||
67 | { | ||
68 | append16( (uint16_t)(ch) ); | ||
69 | } | ||
70 | } | ||
71 | |||
72 | void Bu::UtfString::setUtf8( const Bu::String &sInput ) | ||
73 | { | ||
74 | static uint8_t lmask[8] = { | ||
75 | 0x00, | ||
76 | 0x01, | ||
77 | 0x03, | ||
78 | 0x07, | ||
79 | 0x0f, | ||
80 | 0x1f, | ||
81 | 0x3f, | ||
82 | 0x7f | ||
83 | }; | ||
84 | for( Bu::String::const_iterator i = sInput.begin(); i; i++ ) | ||
85 | { | ||
86 | if( ((int)(uint8_t)*i)&0x80 ) | ||
87 | { | ||
88 | int iBytes = 1; | ||
89 | for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { } | ||
90 | Bu::UtfChar uPt = ((*i) & lmask[7-iBytes])<<(6*(iBytes-1)); | ||
91 | for( iBytes--; iBytes >= 1; iBytes-- ) | ||
92 | { | ||
93 | i++; | ||
94 | uPt |= ((*i)&lmask[6])<<(6*(iBytes-1)); | ||
95 | } | ||
96 | append( uPt ); | ||
97 | } | ||
98 | else | ||
99 | { | ||
100 | append( (Bu::UtfChar)(*i) ); | ||
101 | } | ||
102 | } | ||
103 | } | ||
104 | |||
105 | void Bu::UtfString::setUtf16( const Bu::String &sInput ) | ||
106 | { | ||
107 | uint16_t hi, lo; | ||
108 | for( Bu::String::const_iterator i = sInput.begin(); i; i++ ) | ||
109 | { | ||
110 | hi = (((uint8_t)*i)<<8) | ((uint8_t)*(++i)); | ||
111 | append16( hi ); | ||
112 | if( (hi&0xD800u) == 0xD800u ) | ||
113 | { | ||
114 | lo = (((uint8_t)*(++i))<<8) | ((uint8_t)*(++i)); | ||
115 | append16( lo ); | ||
116 | } | ||
117 | } | ||
118 | } | ||
119 | |||
20 | #include "bu/sio.h" | 120 | #include "bu/sio.h" |
21 | using Bu::sio; | 121 | using Bu::sio; |
22 | 122 | ||
123 | Bu::UtfChar Bu::UtfString::get( int iIndex ) | ||
124 | { | ||
125 | Bu::UtfChar i = aData[iIndex]; | ||
126 | switch( i&0xFC00 ) | ||
127 | { | ||
128 | case 0xD800: | ||
129 | sio << "(hi) "; | ||
130 | return (((i&0x3FF)<<10) | ((aData[iIndex+1]&0x3FF)))+0x10000; | ||
131 | |||
132 | case 0xDC00: | ||
133 | sio << "(lo) "; | ||
134 | return 0; | ||
135 | |||
136 | default: | ||
137 | sio << "(--) "; | ||
138 | return i&0xFC00; | ||
139 | } | ||
140 | } | ||
141 | |||
142 | void Bu::UtfString::debug() | ||
143 | { | ||
144 | sio << "Raw Utf16: "; | ||
145 | for( int i = 0; i < aData.getSize(); i++ ) | ||
146 | { | ||
147 | if( i > 0 ) | ||
148 | sio << ", "; | ||
149 | sio << "0x" << Fmt::hex() << aData[i]; | ||
150 | } | ||
151 | sio << sio.nl; | ||
152 | sio << "Code Points: "; | ||
153 | for( int i = 0; i < aData.getSize(); i++ ) | ||
154 | { | ||
155 | if( i > 0 ) | ||
156 | sio << ", "; | ||
157 | sio << "0x" << Fmt::hex() << get( i ); | ||
158 | } | ||
159 | sio << sio.nl; | ||
160 | } | ||
161 | /* | ||
23 | void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 ) | 162 | void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 ) |
24 | { | 163 | { |
25 | static uint8_t lmask[8] = { | 164 | static uint8_t lmask[8] = { |
@@ -43,7 +182,7 @@ void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 ) | |||
43 | int iBytes = 1; | 182 | int iBytes = 1; |
44 | for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { } | 183 | for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { } |
45 | // sio << "iBytes = " << iBytes << sio.nl; | 184 | // sio << "iBytes = " << iBytes << sio.nl; |
46 | point uPt = ((*i) & lmask[7-iBytes])<<(6*(iBytes-1)); | 185 | Bu::UtfChar uPt = ((*i) & lmask[7-iBytes])<<(6*(iBytes-1)); |
47 | // sio << "mask: " << Bu::Fmt().radix(2).width(8).fill('0') | 186 | // sio << "mask: " << Bu::Fmt().radix(2).width(8).fill('0') |
48 | // << (int)lmask[7-iBytes] << sio.nl; | 187 | // << (int)lmask[7-iBytes] << sio.nl; |
49 | for( iBytes--; iBytes >= 1; iBytes-- ) | 188 | for( iBytes--; iBytes >= 1; iBytes-- ) |
@@ -68,4 +207,4 @@ void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 ) | |||
68 | } | 207 | } |
69 | sio << sio.nl; | 208 | sio << sio.nl; |
70 | } | 209 | } |
71 | 210 | */ | |
diff --git a/src/utfstring.h b/src/utfstring.h index 6f85e93..79ef62e 100644 --- a/src/utfstring.h +++ b/src/utfstring.h | |||
@@ -9,9 +9,12 @@ | |||
9 | #define BU_UTF_STRING_H | 9 | #define BU_UTF_STRING_H |
10 | 10 | ||
11 | #include <stdint.h> | 11 | #include <stdint.h> |
12 | #include "bu/array.h" | ||
12 | 13 | ||
13 | namespace Bu | 14 | namespace Bu |
14 | { | 15 | { |
16 | class String; | ||
17 | |||
15 | /** | 18 | /** |
16 | * UtfChar isn't actually a character, unicode specifies "code points" not | 19 | * UtfChar isn't actually a character, unicode specifies "code points" not |
17 | * characters. The main reason for this is that not all code points define | 20 | * characters. The main reason for this is that not all code points define |
@@ -40,10 +43,23 @@ namespace Bu | |||
40 | UtfString( const Bu::String &sInput, Encoding eEnc=Utf8 ); | 43 | UtfString( const Bu::String &sInput, Encoding eEnc=Utf8 ); |
41 | virtual ~UtfString(); | 44 | virtual ~UtfString(); |
42 | 45 | ||
43 | static void debugUtf8( const Bu::String &sUtf8 ); | 46 | void append( UtfChar ch ); |
47 | |||
48 | void set( const Bu::String &sInput, Encoding eEnc=Utf8 ); | ||
49 | void setUtf8( const Bu::String &sInput ); | ||
50 | void setUtf16( const Bu::String &sInput ); | ||
51 | // void setUtf16be( const Bu::String &sInput ); | ||
52 | // void setUtf16le( const Bu::String &sInput ); | ||
53 | |||
54 | void debug(); | ||
55 | |||
56 | UtfChar get( int iIndex ); | ||
57 | |||
58 | private: | ||
59 | void append16( uint16_t i ) { aData.append( i ); } | ||
44 | 60 | ||
45 | private: | 61 | private: |
46 | uint16_t *pData; | 62 | Bu::Array<uint16_t> aData; |
47 | int iRawLen; | 63 | int iRawLen; |
48 | int iCharLen; | 64 | int iCharLen; |
49 | }; | 65 | }; |