summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorMike Buland <eichlan@xagasoft.com>2011-04-04 07:22:10 +0000
committerMike Buland <eichlan@xagasoft.com>2011-04-04 07:22:10 +0000
commitabbf45c1da7f3e3a542e6c6339a1bab31283f22e (patch)
tree1d40f79bbe315294507bb9bfedfbe2b01e815c1a /src
parentbc5fc82538f220f62f231d5bdda5910752156a32 (diff)
downloadlibbu++-abbf45c1da7f3e3a542e6c6339a1bab31283f22e.tar.gz
libbu++-abbf45c1da7f3e3a542e6c6339a1bab31283f22e.tar.bz2
libbu++-abbf45c1da7f3e3a542e6c6339a1bab31283f22e.tar.xz
libbu++-abbf45c1da7f3e3a542e6c6339a1bab31283f22e.zip
I made some awesome progress on the UtfString system. It stores its data in native UTF-16
encoding to make things easier (little endian in our case). It can currently read UTF-8 and UTF-16BE input, but it does not handle a BOM yet. It will give you full Unicode code points instead of the raw UTF-16 values, which is pretty slick.
Diffstat (limited to 'src')
-rw-r--r--src/config.h3
-rw-r--r--src/tests/utf.cpp3
-rw-r--r--src/utfstring.cpp143
-rw-r--r--src/utfstring.h20
4 files changed, 164 insertions, 5 deletions
diff --git a/src/config.h b/src/config.h
index 3046b59..ce954de 100644
--- a/src/config.h
+++ b/src/config.h
@@ -17,4 +17,7 @@
17 17
18#include "bu/extratypes.h" 18#include "bu/extratypes.h"
19 19
20// Later if we need autoconfig stuff, here's where it'll go.
21// #include "bu/autoconfig.h"
22
20#endif 23#endif
diff --git a/src/tests/utf.cpp b/src/tests/utf.cpp
index 59d49c6..9e075e2 100644
--- a/src/tests/utf.cpp
+++ b/src/tests/utf.cpp
@@ -16,7 +16,8 @@ int main( int argc, char *argv[] )
16 int iAmnt = fIn.read( buf, 4096 ); 16 int iAmnt = fIn.read( buf, 4096 );
17 sUtf8.append( buf, iAmnt ); 17 sUtf8.append( buf, iAmnt );
18 } 18 }
19 Bu::UtfString::debugUtf8( sUtf8 ); 19 Bu::UtfString us( sUtf8, Bu::UtfString::Utf16 );
20 us.debug();
20 } 21 }
21} 22}
22 23
diff --git a/src/utfstring.cpp b/src/utfstring.cpp
index 0e2060b..bb0a011 100644
--- a/src/utfstring.cpp
+++ b/src/utfstring.cpp
@@ -9,17 +9,156 @@
9 9
10#include "bu/string.h" 10#include "bu/string.h"
11 11
12#include <endian.h>
13
12Bu::UtfString::UtfString() 14Bu::UtfString::UtfString()
13{ 15{
14} 16}
15 17
18Bu::UtfString::UtfString( const Bu::String &sInput, Encoding eEnc )
19{
20 set( sInput, eEnc );
21}
22
16Bu::UtfString::~UtfString() 23Bu::UtfString::~UtfString()
17{ 24{
18} 25}
19 26
27void Bu::UtfString::set( const Bu::String &sInput, Encoding eEnc )
28{
29 switch( eEnc )
30 {
31 case Utf8:
32 setUtf8( sInput );
33 break;
34
35 case Utf16:
36 case Utf16be:
37 setUtf16( sInput );
38 break;
39
40 case Utf16le:
41 throw Bu::ExceptionBase("Utf16le not supported yet.");
42 break;
43
44 case Utf32:
45 throw Bu::ExceptionBase("Utf32 not supported yet.");
46 break;
47
48 case Ucs16:
49 throw Bu::ExceptionBase("Ucs16 not supported yet.");
50 break;
51
52 case GuessEncoding:
53 throw Bu::ExceptionBase("Guessing mode not supported yet.");
54 break;
55 }
56}
57
58void Bu::UtfString::append( UtfChar ch )
59{
60 if( ch >= 0x10000 )
61 {
62 ch -= 0x10000;
63 append16( ((ch>>10)&0x3FF)| 0xD800u );
64 append16( (ch&0x3FF)| 0xDC00u );
65 }
66 else
67 {
68 append16( (uint16_t)(ch) );
69 }
70}
71
72void Bu::UtfString::setUtf8( const Bu::String &sInput )
73{
74 static uint8_t lmask[8] = {
75 0x00,
76 0x01,
77 0x03,
78 0x07,
79 0x0f,
80 0x1f,
81 0x3f,
82 0x7f
83 };
84 for( Bu::String::const_iterator i = sInput.begin(); i; i++ )
85 {
86 if( ((int)(uint8_t)*i)&0x80 )
87 {
88 int iBytes = 1;
89 for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { }
90 Bu::UtfChar uPt = ((*i) & lmask[7-iBytes])<<(6*(iBytes-1));
91 for( iBytes--; iBytes >= 1; iBytes-- )
92 {
93 i++;
94 uPt |= ((*i)&lmask[6])<<(6*(iBytes-1));
95 }
96 append( uPt );
97 }
98 else
99 {
100 append( (Bu::UtfChar)(*i) );
101 }
102 }
103}
104
105void Bu::UtfString::setUtf16( const Bu::String &sInput )
106{
107 uint16_t hi, lo;
108 for( Bu::String::const_iterator i = sInput.begin(); i; i++ )
109 {
110 hi = (((uint8_t)*i)<<8) | ((uint8_t)*(++i));
111 append16( hi );
112 if( (hi&0xD800u) == 0xD800u )
113 {
114 lo = (((uint8_t)*(++i))<<8) | ((uint8_t)*(++i));
115 append16( lo );
116 }
117 }
118}
119
20#include "bu/sio.h" 120#include "bu/sio.h"
21using Bu::sio; 121using Bu::sio;
22 122
123Bu::UtfChar Bu::UtfString::get( int iIndex )
124{
125 Bu::UtfChar i = aData[iIndex];
126 switch( i&0xFC00 )
127 {
128 case 0xD800:
129 sio << "(hi) ";
130 return (((i&0x3FF)<<10) | ((aData[iIndex+1]&0x3FF)))+0x10000;
131
132 case 0xDC00:
133 sio << "(lo) ";
134 return 0;
135
136 default:
137 sio << "(--) ";
138 return i&0xFC00;
139 }
140}
141
142void Bu::UtfString::debug()
143{
144 sio << "Raw Utf16: ";
145 for( int i = 0; i < aData.getSize(); i++ )
146 {
147 if( i > 0 )
148 sio << ", ";
149 sio << "0x" << Fmt::hex() << aData[i];
150 }
151 sio << sio.nl;
152 sio << "Code Points: ";
153 for( int i = 0; i < aData.getSize(); i++ )
154 {
155 if( i > 0 )
156 sio << ", ";
157 sio << "0x" << Fmt::hex() << get( i );
158 }
159 sio << sio.nl;
160}
161/*
23void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 ) 162void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 )
24{ 163{
25 static uint8_t lmask[8] = { 164 static uint8_t lmask[8] = {
@@ -43,7 +182,7 @@ void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 )
43 int iBytes = 1; 182 int iBytes = 1;
44 for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { } 183 for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { }
45// sio << "iBytes = " << iBytes << sio.nl; 184// sio << "iBytes = " << iBytes << sio.nl;
46 point uPt = ((*i) & lmask[7-iBytes])<<(6*(iBytes-1)); 185 Bu::UtfChar uPt = ((*i) & lmask[7-iBytes])<<(6*(iBytes-1));
47// sio << "mask: " << Bu::Fmt().radix(2).width(8).fill('0') 186// sio << "mask: " << Bu::Fmt().radix(2).width(8).fill('0')
48// << (int)lmask[7-iBytes] << sio.nl; 187// << (int)lmask[7-iBytes] << sio.nl;
49 for( iBytes--; iBytes >= 1; iBytes-- ) 188 for( iBytes--; iBytes >= 1; iBytes-- )
@@ -68,4 +207,4 @@ void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 )
68 } 207 }
69 sio << sio.nl; 208 sio << sio.nl;
70} 209}
71 210*/
diff --git a/src/utfstring.h b/src/utfstring.h
index 6f85e93..79ef62e 100644
--- a/src/utfstring.h
+++ b/src/utfstring.h
@@ -9,9 +9,12 @@
9#define BU_UTF_STRING_H 9#define BU_UTF_STRING_H
10 10
11#include <stdint.h> 11#include <stdint.h>
12#include "bu/array.h"
12 13
13namespace Bu 14namespace Bu
14{ 15{
16 class String;
17
15 /** 18 /**
16 * UtfChar isn't actually a character, unicode specifies "code points" not 19 * UtfChar isn't actually a character, unicode specifies "code points" not
17 * characters. The main reason for this is that not all code points define 20 * characters. The main reason for this is that not all code points define
@@ -40,10 +43,23 @@ namespace Bu
40 UtfString( const Bu::String &sInput, Encoding eEnc=Utf8 ); 43 UtfString( const Bu::String &sInput, Encoding eEnc=Utf8 );
41 virtual ~UtfString(); 44 virtual ~UtfString();
42 45
43 static void debugUtf8( const Bu::String &sUtf8 ); 46 void append( UtfChar ch );
47
48 void set( const Bu::String &sInput, Encoding eEnc=Utf8 );
49 void setUtf8( const Bu::String &sInput );
50 void setUtf16( const Bu::String &sInput );
51// void setUtf16be( const Bu::String &sInput );
52// void setUtf16le( const Bu::String &sInput );
53
54 void debug();
55
56 UtfChar get( int iIndex );
57
58 private:
59 void append16( uint16_t i ) { aData.append( i ); }
44 60
45 private: 61 private:
46 uint16_t *pData; 62 Bu::Array<uint16_t> aData;
47 int iRawLen; 63 int iRawLen;
48 int iCharLen; 64 int iCharLen;
49 }; 65 };