summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/config.h3
-rw-r--r--src/tests/utf.cpp3
-rw-r--r--src/utfstring.cpp143
-rw-r--r--src/utfstring.h20
4 files changed, 164 insertions, 5 deletions
diff --git a/src/config.h b/src/config.h
index 3046b59..ce954de 100644
--- a/src/config.h
+++ b/src/config.h
@@ -17,4 +17,7 @@
17 17
18#include "bu/extratypes.h" 18#include "bu/extratypes.h"
19 19
20// Later if we need autoconfig stuff, here's where it'll go.
21// #include "bu/autoconfig.h"
22
20#endif 23#endif
diff --git a/src/tests/utf.cpp b/src/tests/utf.cpp
index 59d49c6..9e075e2 100644
--- a/src/tests/utf.cpp
+++ b/src/tests/utf.cpp
@@ -16,7 +16,8 @@ int main( int argc, char *argv[] )
16 int iAmnt = fIn.read( buf, 4096 ); 16 int iAmnt = fIn.read( buf, 4096 );
17 sUtf8.append( buf, iAmnt ); 17 sUtf8.append( buf, iAmnt );
18 } 18 }
19 Bu::UtfString::debugUtf8( sUtf8 ); 19 Bu::UtfString us( sUtf8, Bu::UtfString::Utf16 );
20 us.debug();
20 } 21 }
21} 22}
22 23
diff --git a/src/utfstring.cpp b/src/utfstring.cpp
index 0e2060b..bb0a011 100644
--- a/src/utfstring.cpp
+++ b/src/utfstring.cpp
@@ -9,17 +9,156 @@
9 9
10#include "bu/string.h" 10#include "bu/string.h"
11 11
12#include <endian.h>
13
12Bu::UtfString::UtfString() 14Bu::UtfString::UtfString()
13{ 15{
14} 16}
15 17
18Bu::UtfString::UtfString( const Bu::String &sInput, Encoding eEnc )
19{
20 set( sInput, eEnc );
21}
22
// Destructor: the body is intentionally empty, no explicit cleanup is
// performed here.
16Bu::UtfString::~UtfString() 23Bu::UtfString::~UtfString()
17{ 24{
18} 25}
19 26
27void Bu::UtfString::set( const Bu::String &sInput, Encoding eEnc )
28{
29 switch( eEnc )
30 {
31 case Utf8:
32 setUtf8( sInput );
33 break;
34
35 case Utf16:
36 case Utf16be:
37 setUtf16( sInput );
38 break;
39
40 case Utf16le:
41 throw Bu::ExceptionBase("Utf16le not supported yet.");
42 break;
43
44 case Utf32:
45 throw Bu::ExceptionBase("Utf32 not supported yet.");
46 break;
47
48 case Ucs16:
49 throw Bu::ExceptionBase("Ucs16 not supported yet.");
50 break;
51
52 case GuessEncoding:
53 throw Bu::ExceptionBase("Guessing mode not supported yet.");
54 break;
55 }
56}
57
58void Bu::UtfString::append( UtfChar ch )
59{
60 if( ch >= 0x10000 )
61 {
62 ch -= 0x10000;
63 append16( ((ch>>10)&0x3FF)| 0xD800u );
64 append16( (ch&0x3FF)| 0xDC00u );
65 }
66 else
67 {
68 append16( (uint16_t)(ch) );
69 }
70}
71
72void Bu::UtfString::setUtf8( const Bu::String &sInput )
73{
74 static uint8_t lmask[8] = {
75 0x00,
76 0x01,
77 0x03,
78 0x07,
79 0x0f,
80 0x1f,
81 0x3f,
82 0x7f
83 };
84 for( Bu::String::const_iterator i = sInput.begin(); i; i++ )
85 {
86 if( ((int)(uint8_t)*i)&0x80 )
87 {
88 int iBytes = 1;
89 for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { }
90 Bu::UtfChar uPt = ((*i) & lmask[7-iBytes])<<(6*(iBytes-1));
91 for( iBytes--; iBytes >= 1; iBytes-- )
92 {
93 i++;
94 uPt |= ((*i)&lmask[6])<<(6*(iBytes-1));
95 }
96 append( uPt );
97 }
98 else
99 {
100 append( (Bu::UtfChar)(*i) );
101 }
102 }
103}
104
105void Bu::UtfString::setUtf16( const Bu::String &sInput )
106{
107 uint16_t hi, lo;
108 for( Bu::String::const_iterator i = sInput.begin(); i; i++ )
109 {
110 hi = (((uint8_t)*i)<<8) | ((uint8_t)*(++i));
111 append16( hi );
112 if( (hi&0xD800u) == 0xD800u )
113 {
114 lo = (((uint8_t)*(++i))<<8) | ((uint8_t)*(++i));
115 append16( lo );
116 }
117 }
118}
119
20#include "bu/sio.h" 120#include "bu/sio.h"
21using Bu::sio; 121using Bu::sio;
22 122
123Bu::UtfChar Bu::UtfString::get( int iIndex )
124{
125 Bu::UtfChar i = aData[iIndex];
126 switch( i&0xFC00 )
127 {
128 case 0xD800:
129 sio << "(hi) ";
130 return (((i&0x3FF)<<10) | ((aData[iIndex+1]&0x3FF)))+0x10000;
131
132 case 0xDC00:
133 sio << "(lo) ";
134 return 0;
135
136 default:
137 sio << "(--) ";
138 return i&0xFC00;
139 }
140}
141
142void Bu::UtfString::debug()
143{
144 sio << "Raw Utf16: ";
145 for( int i = 0; i < aData.getSize(); i++ )
146 {
147 if( i > 0 )
148 sio << ", ";
149 sio << "0x" << Fmt::hex() << aData[i];
150 }
151 sio << sio.nl;
152 sio << "Code Points: ";
153 for( int i = 0; i < aData.getSize(); i++ )
154 {
155 if( i > 0 )
156 sio << ", ";
157 sio << "0x" << Fmt::hex() << get( i );
158 }
159 sio << sio.nl;
160}
161/*
23void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 ) 162void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 )
24{ 163{
25 static uint8_t lmask[8] = { 164 static uint8_t lmask[8] = {
@@ -43,7 +182,7 @@ void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 )
43 int iBytes = 1; 182 int iBytes = 1;
44 for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { } 183 for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { }
45// sio << "iBytes = " << iBytes << sio.nl; 184// sio << "iBytes = " << iBytes << sio.nl;
46 point uPt = ((*i) & lmask[7-iBytes])<<(6*(iBytes-1)); 185 Bu::UtfChar uPt = ((*i) & lmask[7-iBytes])<<(6*(iBytes-1));
47// sio << "mask: " << Bu::Fmt().radix(2).width(8).fill('0') 186// sio << "mask: " << Bu::Fmt().radix(2).width(8).fill('0')
48// << (int)lmask[7-iBytes] << sio.nl; 187// << (int)lmask[7-iBytes] << sio.nl;
49 for( iBytes--; iBytes >= 1; iBytes-- ) 188 for( iBytes--; iBytes >= 1; iBytes-- )
@@ -68,4 +207,4 @@ void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 )
68 } 207 }
69 sio << sio.nl; 208 sio << sio.nl;
70} 209}
71 210*/
diff --git a/src/utfstring.h b/src/utfstring.h
index 6f85e93..79ef62e 100644
--- a/src/utfstring.h
+++ b/src/utfstring.h
@@ -9,9 +9,12 @@
9#define BU_UTF_STRING_H 9#define BU_UTF_STRING_H
10 10
11#include <stdint.h> 11#include <stdint.h>
12#include "bu/array.h"
12 13
13namespace Bu 14namespace Bu
14{ 15{
16 class String;
17
15 /** 18 /**
16 * UtfChar isn't actually a character, unicode specifies "code points" not 19 * UtfChar isn't actually a character, unicode specifies "code points" not
17 * characters. The main reason for this is that not all code points define 20 * characters. The main reason for this is that not all code points define
@@ -40,10 +43,23 @@ namespace Bu
40 UtfString( const Bu::String &sInput, Encoding eEnc=Utf8 ); 43 UtfString( const Bu::String &sInput, Encoding eEnc=Utf8 );
41 virtual ~UtfString(); 44 virtual ~UtfString();
42 45
43 static void debugUtf8( const Bu::String &sUtf8 ); 46 void append( UtfChar ch );
47
48 void set( const Bu::String &sInput, Encoding eEnc=Utf8 );
49 void setUtf8( const Bu::String &sInput );
50 void setUtf16( const Bu::String &sInput );
51// void setUtf16be( const Bu::String &sInput );
52// void setUtf16le( const Bu::String &sInput );
53
54 void debug();
55
56 UtfChar get( int iIndex );
57
58 private:
59 void append16( uint16_t i ) { aData.append( i ); }
44 60
45 private: 61 private:
46 uint16_t *pData; 62 Bu::Array<uint16_t> aData;
47 int iRawLen; 63 int iRawLen;
48 int iCharLen; 64 int iCharLen;
49 }; 65 };