diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/config.h | 3 | ||||
| -rw-r--r-- | src/tests/utf.cpp | 3 | ||||
| -rw-r--r-- | src/utfstring.cpp | 143 | ||||
| -rw-r--r-- | src/utfstring.h | 20 |
4 files changed, 164 insertions, 5 deletions
diff --git a/src/config.h b/src/config.h index 3046b59..ce954de 100644 --- a/src/config.h +++ b/src/config.h | |||
| @@ -17,4 +17,7 @@ | |||
| 17 | 17 | ||
| 18 | #include "bu/extratypes.h" | 18 | #include "bu/extratypes.h" |
| 19 | 19 | ||
| 20 | // Later if we need autoconfig stuff, here's where it'll go. | ||
| 21 | // #include "bu/autoconfig.h" | ||
| 22 | |||
| 20 | #endif | 23 | #endif |
diff --git a/src/tests/utf.cpp b/src/tests/utf.cpp index 59d49c6..9e075e2 100644 --- a/src/tests/utf.cpp +++ b/src/tests/utf.cpp | |||
| @@ -16,7 +16,8 @@ int main( int argc, char *argv[] ) | |||
| 16 | int iAmnt = fIn.read( buf, 4096 ); | 16 | int iAmnt = fIn.read( buf, 4096 ); |
| 17 | sUtf8.append( buf, iAmnt ); | 17 | sUtf8.append( buf, iAmnt ); |
| 18 | } | 18 | } |
| 19 | Bu::UtfString::debugUtf8( sUtf8 ); | 19 | Bu::UtfString us( sUtf8, Bu::UtfString::Utf16 ); |
| 20 | us.debug(); | ||
| 20 | } | 21 | } |
| 21 | } | 22 | } |
| 22 | 23 | ||
diff --git a/src/utfstring.cpp b/src/utfstring.cpp index 0e2060b..bb0a011 100644 --- a/src/utfstring.cpp +++ b/src/utfstring.cpp | |||
| @@ -9,17 +9,156 @@ | |||
| 9 | 9 | ||
| 10 | #include "bu/string.h" | 10 | #include "bu/string.h" |
| 11 | 11 | ||
| 12 | #include <endian.h> | ||
| 13 | |||
| 12 | Bu::UtfString::UtfString() | 14 | Bu::UtfString::UtfString() |
| 13 | { | 15 | { |
| 14 | } | 16 | } |
| 15 | 17 | ||
| 18 | Bu::UtfString::UtfString( const Bu::String &sInput, Encoding eEnc ) | ||
| 19 | { | ||
| 20 | set( sInput, eEnc ); | ||
| 21 | } | ||
| 22 | |||
| 16 | Bu::UtfString::~UtfString() | 23 | Bu::UtfString::~UtfString() |
| 17 | { | 24 | { |
| 18 | } | 25 | } |
| 19 | 26 | ||
| 27 | void Bu::UtfString::set( const Bu::String &sInput, Encoding eEnc ) | ||
| 28 | { | ||
| 29 | switch( eEnc ) | ||
| 30 | { | ||
| 31 | case Utf8: | ||
| 32 | setUtf8( sInput ); | ||
| 33 | break; | ||
| 34 | |||
| 35 | case Utf16: | ||
| 36 | case Utf16be: | ||
| 37 | setUtf16( sInput ); | ||
| 38 | break; | ||
| 39 | |||
| 40 | case Utf16le: | ||
| 41 | throw Bu::ExceptionBase("Utf16le not supported yet."); | ||
| 42 | break; | ||
| 43 | |||
| 44 | case Utf32: | ||
| 45 | throw Bu::ExceptionBase("Utf32 not supported yet."); | ||
| 46 | break; | ||
| 47 | |||
| 48 | case Ucs16: | ||
| 49 | throw Bu::ExceptionBase("Ucs16 not supported yet."); | ||
| 50 | break; | ||
| 51 | |||
| 52 | case GuessEncoding: | ||
| 53 | throw Bu::ExceptionBase("Guessing mode not supported yet."); | ||
| 54 | break; | ||
| 55 | } | ||
| 56 | } | ||
| 57 | |||
| 58 | void Bu::UtfString::append( UtfChar ch ) | ||
| 59 | { | ||
| 60 | if( ch >= 0x10000 ) | ||
| 61 | { | ||
| 62 | ch -= 0x10000; | ||
| 63 | append16( ((ch>>10)&0x3FF)| 0xD800u ); | ||
| 64 | append16( (ch&0x3FF)| 0xDC00u ); | ||
| 65 | } | ||
| 66 | else | ||
| 67 | { | ||
| 68 | append16( (uint16_t)(ch) ); | ||
| 69 | } | ||
| 70 | } | ||
| 71 | |||
| 72 | void Bu::UtfString::setUtf8( const Bu::String &sInput ) | ||
| 73 | { | ||
| 74 | static uint8_t lmask[8] = { | ||
| 75 | 0x00, | ||
| 76 | 0x01, | ||
| 77 | 0x03, | ||
| 78 | 0x07, | ||
| 79 | 0x0f, | ||
| 80 | 0x1f, | ||
| 81 | 0x3f, | ||
| 82 | 0x7f | ||
| 83 | }; | ||
| 84 | for( Bu::String::const_iterator i = sInput.begin(); i; i++ ) | ||
| 85 | { | ||
| 86 | if( ((int)(uint8_t)*i)&0x80 ) | ||
| 87 | { | ||
| 88 | int iBytes = 1; | ||
| 89 | for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { } | ||
| 90 | Bu::UtfChar uPt = ((*i) & lmask[7-iBytes])<<(6*(iBytes-1)); | ||
| 91 | for( iBytes--; iBytes >= 1; iBytes-- ) | ||
| 92 | { | ||
| 93 | i++; | ||
| 94 | uPt |= ((*i)&lmask[6])<<(6*(iBytes-1)); | ||
| 95 | } | ||
| 96 | append( uPt ); | ||
| 97 | } | ||
| 98 | else | ||
| 99 | { | ||
| 100 | append( (Bu::UtfChar)(*i) ); | ||
| 101 | } | ||
| 102 | } | ||
| 103 | } | ||
| 104 | |||
| 105 | void Bu::UtfString::setUtf16( const Bu::String &sInput ) | ||
| 106 | { | ||
| 107 | uint16_t hi, lo; | ||
| 108 | for( Bu::String::const_iterator i = sInput.begin(); i; i++ ) | ||
| 109 | { | ||
| 110 | hi = (((uint8_t)*i)<<8) | ((uint8_t)*(++i)); | ||
| 111 | append16( hi ); | ||
| 112 | if( (hi&0xD800u) == 0xD800u ) | ||
| 113 | { | ||
| 114 | lo = (((uint8_t)*(++i))<<8) | ((uint8_t)*(++i)); | ||
| 115 | append16( lo ); | ||
| 116 | } | ||
| 117 | } | ||
| 118 | } | ||
| 119 | |||
| 20 | #include "bu/sio.h" | 120 | #include "bu/sio.h" |
| 21 | using Bu::sio; | 121 | using Bu::sio; |
| 22 | 122 | ||
| 123 | Bu::UtfChar Bu::UtfString::get( int iIndex ) | ||
| 124 | { | ||
| 125 | Bu::UtfChar i = aData[iIndex]; | ||
| 126 | switch( i&0xFC00 ) | ||
| 127 | { | ||
| 128 | case 0xD800: | ||
| 129 | sio << "(hi) "; | ||
| 130 | return (((i&0x3FF)<<10) | ((aData[iIndex+1]&0x3FF)))+0x10000; | ||
| 131 | |||
| 132 | case 0xDC00: | ||
| 133 | sio << "(lo) "; | ||
| 134 | return 0; | ||
| 135 | |||
| 136 | default: | ||
| 137 | sio << "(--) "; | ||
| 138 | return i&0xFC00; | ||
| 139 | } | ||
| 140 | } | ||
| 141 | |||
| 142 | void Bu::UtfString::debug() | ||
| 143 | { | ||
| 144 | sio << "Raw Utf16: "; | ||
| 145 | for( int i = 0; i < aData.getSize(); i++ ) | ||
| 146 | { | ||
| 147 | if( i > 0 ) | ||
| 148 | sio << ", "; | ||
| 149 | sio << "0x" << Fmt::hex() << aData[i]; | ||
| 150 | } | ||
| 151 | sio << sio.nl; | ||
| 152 | sio << "Code Points: "; | ||
| 153 | for( int i = 0; i < aData.getSize(); i++ ) | ||
| 154 | { | ||
| 155 | if( i > 0 ) | ||
| 156 | sio << ", "; | ||
| 157 | sio << "0x" << Fmt::hex() << get( i ); | ||
| 158 | } | ||
| 159 | sio << sio.nl; | ||
| 160 | } | ||
| 161 | /* | ||
| 23 | void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 ) | 162 | void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 ) |
| 24 | { | 163 | { |
| 25 | static uint8_t lmask[8] = { | 164 | static uint8_t lmask[8] = { |
| @@ -43,7 +182,7 @@ void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 ) | |||
| 43 | int iBytes = 1; | 182 | int iBytes = 1; |
| 44 | for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { } | 183 | for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { } |
| 45 | // sio << "iBytes = " << iBytes << sio.nl; | 184 | // sio << "iBytes = " << iBytes << sio.nl; |
| 46 | point uPt = ((*i) & lmask[7-iBytes])<<(6*(iBytes-1)); | 185 | Bu::UtfChar uPt = ((*i) & lmask[7-iBytes])<<(6*(iBytes-1)); |
| 47 | // sio << "mask: " << Bu::Fmt().radix(2).width(8).fill('0') | 186 | // sio << "mask: " << Bu::Fmt().radix(2).width(8).fill('0') |
| 48 | // << (int)lmask[7-iBytes] << sio.nl; | 187 | // << (int)lmask[7-iBytes] << sio.nl; |
| 49 | for( iBytes--; iBytes >= 1; iBytes-- ) | 188 | for( iBytes--; iBytes >= 1; iBytes-- ) |
| @@ -68,4 +207,4 @@ void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 ) | |||
| 68 | } | 207 | } |
| 69 | sio << sio.nl; | 208 | sio << sio.nl; |
| 70 | } | 209 | } |
| 71 | 210 | */ | |
diff --git a/src/utfstring.h b/src/utfstring.h index 6f85e93..79ef62e 100644 --- a/src/utfstring.h +++ b/src/utfstring.h | |||
| @@ -9,9 +9,12 @@ | |||
| 9 | #define BU_UTF_STRING_H | 9 | #define BU_UTF_STRING_H |
| 10 | 10 | ||
| 11 | #include <stdint.h> | 11 | #include <stdint.h> |
| 12 | #include "bu/array.h" | ||
| 12 | 13 | ||
| 13 | namespace Bu | 14 | namespace Bu |
| 14 | { | 15 | { |
| 16 | class String; | ||
| 17 | |||
| 15 | /** | 18 | /** |
| 16 | * UtfChar isn't actually a character, unicode specifies "code points" not | 19 | * UtfChar isn't actually a character, unicode specifies "code points" not |
| 17 | * characters. The main reason for this is that not all code points define | 20 | * characters. The main reason for this is that not all code points define |
| @@ -40,10 +43,23 @@ namespace Bu | |||
| 40 | UtfString( const Bu::String &sInput, Encoding eEnc=Utf8 ); | 43 | UtfString( const Bu::String &sInput, Encoding eEnc=Utf8 ); |
| 41 | virtual ~UtfString(); | 44 | virtual ~UtfString(); |
| 42 | 45 | ||
| 43 | static void debugUtf8( const Bu::String &sUtf8 ); | 46 | void append( UtfChar ch ); |
| 47 | |||
| 48 | void set( const Bu::String &sInput, Encoding eEnc=Utf8 ); | ||
| 49 | void setUtf8( const Bu::String &sInput ); | ||
| 50 | void setUtf16( const Bu::String &sInput ); | ||
| 51 | // void setUtf16be( const Bu::String &sInput ); | ||
| 52 | // void setUtf16le( const Bu::String &sInput ); | ||
| 53 | |||
| 54 | void debug(); | ||
| 55 | |||
| 56 | UtfChar get( int iIndex ); | ||
| 57 | |||
| 58 | private: | ||
| 59 | void append16( uint16_t i ) { aData.append( i ); } | ||
| 44 | 60 | ||
| 45 | private: | 61 | private: |
| 46 | uint16_t *pData; | 62 | Bu::Array<uint16_t> aData; |
| 47 | int iRawLen; | 63 | int iRawLen; |
| 48 | int iCharLen; | 64 | int iCharLen; |
| 49 | }; | 65 | }; |
