summaryrefslogtreecommitdiff
path: root/src/utfstring.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/utfstring.cpp')
-rw-r--r--src/utfstring.cpp143
1 files changed, 141 insertions, 2 deletions
diff --git a/src/utfstring.cpp b/src/utfstring.cpp
index 0e2060b..bb0a011 100644
--- a/src/utfstring.cpp
+++ b/src/utfstring.cpp
@@ -9,17 +9,156 @@
9 9
10#include "bu/string.h" 10#include "bu/string.h"
11 11
12#include <endian.h>
13
12Bu::UtfString::UtfString() 14Bu::UtfString::UtfString()
13{ 15{
14} 16}
15 17
18Bu::UtfString::UtfString( const Bu::String &sInput, Encoding eEnc )
19{
20 set( sInput, eEnc );
21}
22
16Bu::UtfString::~UtfString() 23Bu::UtfString::~UtfString()
17{ 24{
18} 25}
19 26
27void Bu::UtfString::set( const Bu::String &sInput, Encoding eEnc )
28{
29 switch( eEnc )
30 {
31 case Utf8:
32 setUtf8( sInput );
33 break;
34
35 case Utf16:
36 case Utf16be:
37 setUtf16( sInput );
38 break;
39
40 case Utf16le:
41 throw Bu::ExceptionBase("Utf16le not supported yet.");
42 break;
43
44 case Utf32:
45 throw Bu::ExceptionBase("Utf32 not supported yet.");
46 break;
47
48 case Ucs16:
49 throw Bu::ExceptionBase("Ucs16 not supported yet.");
50 break;
51
52 case GuessEncoding:
53 throw Bu::ExceptionBase("Guessing mode not supported yet.");
54 break;
55 }
56}
57
58void Bu::UtfString::append( UtfChar ch )
59{
60 if( ch >= 0x10000 )
61 {
62 ch -= 0x10000;
63 append16( ((ch>>10)&0x3FF)| 0xD800u );
64 append16( (ch&0x3FF)| 0xDC00u );
65 }
66 else
67 {
68 append16( (uint16_t)(ch) );
69 }
70}
71
72void Bu::UtfString::setUtf8( const Bu::String &sInput )
73{
74 static uint8_t lmask[8] = {
75 0x00,
76 0x01,
77 0x03,
78 0x07,
79 0x0f,
80 0x1f,
81 0x3f,
82 0x7f
83 };
84 for( Bu::String::const_iterator i = sInput.begin(); i; i++ )
85 {
86 if( ((int)(uint8_t)*i)&0x80 )
87 {
88 int iBytes = 1;
89 for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { }
90 Bu::UtfChar uPt = ((*i) & lmask[7-iBytes])<<(6*(iBytes-1));
91 for( iBytes--; iBytes >= 1; iBytes-- )
92 {
93 i++;
94 uPt |= ((*i)&lmask[6])<<(6*(iBytes-1));
95 }
96 append( uPt );
97 }
98 else
99 {
100 append( (Bu::UtfChar)(*i) );
101 }
102 }
103}
104
105void Bu::UtfString::setUtf16( const Bu::String &sInput )
106{
107 uint16_t hi, lo;
108 for( Bu::String::const_iterator i = sInput.begin(); i; i++ )
109 {
110 hi = (((uint8_t)*i)<<8) | ((uint8_t)*(++i));
111 append16( hi );
112 if( (hi&0xD800u) == 0xD800u )
113 {
114 lo = (((uint8_t)*(++i))<<8) | ((uint8_t)*(++i));
115 append16( lo );
116 }
117 }
118}
119
20#include "bu/sio.h" 120#include "bu/sio.h"
21using Bu::sio; 121using Bu::sio;
22 122
123Bu::UtfChar Bu::UtfString::get( int iIndex )
124{
125 Bu::UtfChar i = aData[iIndex];
126 switch( i&0xFC00 )
127 {
128 case 0xD800:
129 sio << "(hi) ";
130 return (((i&0x3FF)<<10) | ((aData[iIndex+1]&0x3FF)))+0x10000;
131
132 case 0xDC00:
133 sio << "(lo) ";
134 return 0;
135
136 default:
137 sio << "(--) ";
138 return i&0xFC00;
139 }
140}
141
142void Bu::UtfString::debug()
143{
144 sio << "Raw Utf16: ";
145 for( int i = 0; i < aData.getSize(); i++ )
146 {
147 if( i > 0 )
148 sio << ", ";
149 sio << "0x" << Fmt::hex() << aData[i];
150 }
151 sio << sio.nl;
152 sio << "Code Points: ";
153 for( int i = 0; i < aData.getSize(); i++ )
154 {
155 if( i > 0 )
156 sio << ", ";
157 sio << "0x" << Fmt::hex() << get( i );
158 }
159 sio << sio.nl;
160}
161/*
23void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 ) 162void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 )
24{ 163{
25 static uint8_t lmask[8] = { 164 static uint8_t lmask[8] = {
@@ -43,7 +182,7 @@ void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 )
43 int iBytes = 1; 182 int iBytes = 1;
44 for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { } 183 for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { }
45// sio << "iBytes = " << iBytes << sio.nl; 184// sio << "iBytes = " << iBytes << sio.nl;
46 point uPt = ((*i) & lmask[7-iBytes])<<(6*(iBytes-1)); 185 Bu::UtfChar uPt = ((*i) & lmask[7-iBytes])<<(6*(iBytes-1));
47// sio << "mask: " << Bu::Fmt().radix(2).width(8).fill('0') 186// sio << "mask: " << Bu::Fmt().radix(2).width(8).fill('0')
48// << (int)lmask[7-iBytes] << sio.nl; 187// << (int)lmask[7-iBytes] << sio.nl;
49 for( iBytes--; iBytes >= 1; iBytes-- ) 188 for( iBytes--; iBytes >= 1; iBytes-- )
@@ -68,4 +207,4 @@ void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 )
68 } 207 }
69 sio << sio.nl; 208 sio << sio.nl;
70} 209}
71 210*/