diff options
| -rw-r--r-- | autoconfig.cpp | 30 | ||||
| -rw-r--r-- | default.bld | 37 | ||||
| -rw-r--r-- | src/config.h | 3 | ||||
| -rw-r--r-- | src/tests/utf.cpp | 3 | ||||
| -rw-r--r-- | src/utfstring.cpp | 143 | ||||
| -rw-r--r-- | src/utfstring.h | 20 |
6 files changed, 228 insertions, 8 deletions
diff --git a/autoconfig.cpp b/autoconfig.cpp new file mode 100644 index 0000000..aa8b6a4 --- /dev/null +++ b/autoconfig.cpp | |||
| @@ -0,0 +1,30 @@ | |||
| 1 | #include <stdint.h> | ||
| 2 | #include <stdio.h> | ||
| 3 | |||
/**
 * Probe the byte order of the host and report it.
 *
 * Three #define lines (LITTLE_ENDIAN, BIG_ENDIAN and ENDIANNESS) are written
 * to stderr, and a one line human readable summary is written to stdout.
 */
void detectEndianness()
{
	// 0x0100 is stored with 0x01 in its first byte on a big endian host and
	// with 0x00 there on a little endian host, so the first byte is exactly
	// the value wanted for ENDIANNESS (BIG_ENDIAN is 1, LITTLE_ENDIAN is 0).
	uint16_t x=0x0100;
	const int iFirstByte = ((uint8_t *)&x)[0];

	fprintf( stderr,
		"#define LITTLE_ENDIAN 0\n"
		"#define BIG_ENDIAN 1\n"
		"#define ENDIANNESS %d\n\n",
		iFirstByte
		);
	printf("Architecture is: %s Endian\n", iFirstByte?"Big":"Little" );
}
| 15 | |||
/**
 * Write the skeleton of the configuration header to stderr: an include guard
 * for BU_AUTO_CONFIG_H with nothing between its opening and closing lines.
 */
int main()
{
	// Opening half of the include guard.
	fputs(
		"#ifndef BU_AUTO_CONFIG_H\n"
		"#define BU_AUTO_CONFIG_H\n\n",
		stderr
		);

	// Endianness detection is switched off; <endian.h> already covers it.
	//	detectEndianness();

	// Closing half of the include guard.
	fputs( "#endif\n", stderr );

	return 0;
}
| 30 | |||
diff --git a/default.bld b/default.bld index aa7f4cb..1aca56a 100644 --- a/default.bld +++ b/default.bld | |||
| @@ -15,7 +15,8 @@ CXXFLAGS += "-ggdb -W -Wall -I."; | |||
| 15 | 15 | ||
| 16 | action "default" | 16 | action "default" |
| 17 | { | 17 | { |
| 18 | build: [targets("header-links"), "libbu++.a", targets("tools")]; | 18 | build: [targets("header-links"), "libbu++.a", |
| 19 | targets("tools")]; | ||
| 19 | } | 20 | } |
| 20 | 21 | ||
| 21 | action "pkg" | 22 | action "pkg" |
| @@ -25,14 +26,44 @@ action "pkg" | |||
| 25 | 26 | ||
| 26 | action "all" | 27 | action "all" |
| 27 | { | 28 | { |
| 28 | build: [targets("header-links"), "libbu++.a", targets("tools"), | 29 | build: [targets("header-links"), "libbu++.a", |
| 29 | targets("tests")]; | 30 | targets("tools"), targets("tests")]; |
| 30 | } | 31 | } |
| 31 | 32 | ||
| 32 | action "unit" | 33 | action "unit" |
| 33 | { | 34 | { |
| 34 | build: targets("unit tests"); | 35 | build: targets("unit tests"); |
| 35 | } | 36 | } |
| 37 | /* | ||
| 38 | target "src/autoconfig.h" | ||
| 39 | { | ||
| 40 | input "autoconfig"; | ||
| 41 | display "autoconfig"; | ||
| 42 | profile "build" | ||
| 43 | { | ||
| 44 | execute("./autoconfig 2>src/autoconfig.h"); | ||
| 45 | } | ||
| 46 | } | ||
| 47 | |||
| 48 | target "bu/autoconfig.h" | ||
| 49 | { | ||
| 50 | tag "header-links"; | ||
| 51 | display "symlink"; | ||
| 52 | input "src/autoconfig.h"; | ||
| 53 | profile "build" | ||
| 54 | { | ||
| 55 | execute("echo ${INPUT}"); | ||
| 56 | execute("echo ${OUTPUT}"); | ||
| 57 | execute("mkdir -p $(dirname ${OUTPUT}); ln -s ../${INPUT} ${OUTPUT}"); | ||
| 58 | } | ||
| 59 | } | ||
| 60 | |||
| 61 | target "autoconfig" | ||
| 62 | { | ||
| 63 | rule "exe"; | ||
| 64 | input "autoconfig.cpp"; | ||
| 65 | } | ||
| 66 | */ | ||
| 36 | 67 | ||
| 37 | target files("src/*.h").replace("src/", "bu/") | 68 | target files("src/*.h").replace("src/", "bu/") |
| 38 | { | 69 | { |
diff --git a/src/config.h b/src/config.h index 3046b59..ce954de 100644 --- a/src/config.h +++ b/src/config.h | |||
| @@ -17,4 +17,7 @@ | |||
| 17 | 17 | ||
| 18 | #include "bu/extratypes.h" | 18 | #include "bu/extratypes.h" |
| 19 | 19 | ||
| 20 | // Later if we need autoconfig stuff, here's where it'll go. | ||
| 21 | // #include "bu/autoconfig.h" | ||
| 22 | |||
| 20 | #endif | 23 | #endif |
diff --git a/src/tests/utf.cpp b/src/tests/utf.cpp index 59d49c6..9e075e2 100644 --- a/src/tests/utf.cpp +++ b/src/tests/utf.cpp | |||
| @@ -16,7 +16,8 @@ int main( int argc, char *argv[] ) | |||
| 16 | int iAmnt = fIn.read( buf, 4096 ); | 16 | int iAmnt = fIn.read( buf, 4096 ); |
| 17 | sUtf8.append( buf, iAmnt ); | 17 | sUtf8.append( buf, iAmnt ); |
| 18 | } | 18 | } |
| 19 | Bu::UtfString::debugUtf8( sUtf8 ); | 19 | Bu::UtfString us( sUtf8, Bu::UtfString::Utf16 ); |
| 20 | us.debug(); | ||
| 20 | } | 21 | } |
| 21 | } | 22 | } |
| 22 | 23 | ||
diff --git a/src/utfstring.cpp b/src/utfstring.cpp index 0e2060b..bb0a011 100644 --- a/src/utfstring.cpp +++ b/src/utfstring.cpp | |||
| @@ -9,17 +9,156 @@ | |||
| 9 | 9 | ||
| 10 | #include "bu/string.h" | 10 | #include "bu/string.h" |
| 11 | 11 | ||
| 12 | #include <endian.h> | ||
| 13 | |||
| 12 | Bu::UtfString::UtfString() | 14 | Bu::UtfString::UtfString() |
| 13 | { | 15 | { |
| 14 | } | 16 | } |
| 15 | 17 | ||
| 18 | Bu::UtfString::UtfString( const Bu::String &sInput, Encoding eEnc ) | ||
| 19 | { | ||
| 20 | set( sInput, eEnc ); | ||
| 21 | } | ||
| 22 | |||
| 16 | Bu::UtfString::~UtfString() | 23 | Bu::UtfString::~UtfString() |
| 17 | { | 24 | { |
| 18 | } | 25 | } |
| 19 | 26 | ||
| 27 | void Bu::UtfString::set( const Bu::String &sInput, Encoding eEnc ) | ||
| 28 | { | ||
| 29 | switch( eEnc ) | ||
| 30 | { | ||
| 31 | case Utf8: | ||
| 32 | setUtf8( sInput ); | ||
| 33 | break; | ||
| 34 | |||
| 35 | case Utf16: | ||
| 36 | case Utf16be: | ||
| 37 | setUtf16( sInput ); | ||
| 38 | break; | ||
| 39 | |||
| 40 | case Utf16le: | ||
| 41 | throw Bu::ExceptionBase("Utf16le not supported yet."); | ||
| 42 | break; | ||
| 43 | |||
| 44 | case Utf32: | ||
| 45 | throw Bu::ExceptionBase("Utf32 not supported yet."); | ||
| 46 | break; | ||
| 47 | |||
| 48 | case Ucs16: | ||
| 49 | throw Bu::ExceptionBase("Ucs16 not supported yet."); | ||
| 50 | break; | ||
| 51 | |||
| 52 | case GuessEncoding: | ||
| 53 | throw Bu::ExceptionBase("Guessing mode not supported yet."); | ||
| 54 | break; | ||
| 55 | } | ||
| 56 | } | ||
| 57 | |||
| 58 | void Bu::UtfString::append( UtfChar ch ) | ||
| 59 | { | ||
| 60 | if( ch >= 0x10000 ) | ||
| 61 | { | ||
| 62 | ch -= 0x10000; | ||
| 63 | append16( ((ch>>10)&0x3FF)| 0xD800u ); | ||
| 64 | append16( (ch&0x3FF)| 0xDC00u ); | ||
| 65 | } | ||
| 66 | else | ||
| 67 | { | ||
| 68 | append16( (uint16_t)(ch) ); | ||
| 69 | } | ||
| 70 | } | ||
| 71 | |||
| 72 | void Bu::UtfString::setUtf8( const Bu::String &sInput ) | ||
| 73 | { | ||
| 74 | static uint8_t lmask[8] = { | ||
| 75 | 0x00, | ||
| 76 | 0x01, | ||
| 77 | 0x03, | ||
| 78 | 0x07, | ||
| 79 | 0x0f, | ||
| 80 | 0x1f, | ||
| 81 | 0x3f, | ||
| 82 | 0x7f | ||
| 83 | }; | ||
| 84 | for( Bu::String::const_iterator i = sInput.begin(); i; i++ ) | ||
| 85 | { | ||
| 86 | if( ((int)(uint8_t)*i)&0x80 ) | ||
| 87 | { | ||
| 88 | int iBytes = 1; | ||
| 89 | for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { } | ||
| 90 | Bu::UtfChar uPt = ((*i) & lmask[7-iBytes])<<(6*(iBytes-1)); | ||
| 91 | for( iBytes--; iBytes >= 1; iBytes-- ) | ||
| 92 | { | ||
| 93 | i++; | ||
| 94 | uPt |= ((*i)&lmask[6])<<(6*(iBytes-1)); | ||
| 95 | } | ||
| 96 | append( uPt ); | ||
| 97 | } | ||
| 98 | else | ||
| 99 | { | ||
| 100 | append( (Bu::UtfChar)(*i) ); | ||
| 101 | } | ||
| 102 | } | ||
| 103 | } | ||
| 104 | |||
| 105 | void Bu::UtfString::setUtf16( const Bu::String &sInput ) | ||
| 106 | { | ||
| 107 | uint16_t hi, lo; | ||
| 108 | for( Bu::String::const_iterator i = sInput.begin(); i; i++ ) | ||
| 109 | { | ||
| 110 | hi = (((uint8_t)*i)<<8) | ((uint8_t)*(++i)); | ||
| 111 | append16( hi ); | ||
| 112 | if( (hi&0xD800u) == 0xD800u ) | ||
| 113 | { | ||
| 114 | lo = (((uint8_t)*(++i))<<8) | ((uint8_t)*(++i)); | ||
| 115 | append16( lo ); | ||
| 116 | } | ||
| 117 | } | ||
| 118 | } | ||
| 119 | |||
| 20 | #include "bu/sio.h" | 120 | #include "bu/sio.h" |
| 21 | using Bu::sio; | 121 | using Bu::sio; |
| 22 | 122 | ||
| 123 | Bu::UtfChar Bu::UtfString::get( int iIndex ) | ||
| 124 | { | ||
| 125 | Bu::UtfChar i = aData[iIndex]; | ||
| 126 | switch( i&0xFC00 ) | ||
| 127 | { | ||
| 128 | case 0xD800: | ||
| 129 | sio << "(hi) "; | ||
| 130 | return (((i&0x3FF)<<10) | ((aData[iIndex+1]&0x3FF)))+0x10000; | ||
| 131 | |||
| 132 | case 0xDC00: | ||
| 133 | sio << "(lo) "; | ||
| 134 | return 0; | ||
| 135 | |||
| 136 | default: | ||
| 137 | sio << "(--) "; | ||
| 138 | return i&0xFC00; | ||
| 139 | } | ||
| 140 | } | ||
| 141 | |||
| 142 | void Bu::UtfString::debug() | ||
| 143 | { | ||
| 144 | sio << "Raw Utf16: "; | ||
| 145 | for( int i = 0; i < aData.getSize(); i++ ) | ||
| 146 | { | ||
| 147 | if( i > 0 ) | ||
| 148 | sio << ", "; | ||
| 149 | sio << "0x" << Fmt::hex() << aData[i]; | ||
| 150 | } | ||
| 151 | sio << sio.nl; | ||
| 152 | sio << "Code Points: "; | ||
| 153 | for( int i = 0; i < aData.getSize(); i++ ) | ||
| 154 | { | ||
| 155 | if( i > 0 ) | ||
| 156 | sio << ", "; | ||
| 157 | sio << "0x" << Fmt::hex() << get( i ); | ||
| 158 | } | ||
| 159 | sio << sio.nl; | ||
| 160 | } | ||
| 161 | /* | ||
| 23 | void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 ) | 162 | void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 ) |
| 24 | { | 163 | { |
| 25 | static uint8_t lmask[8] = { | 164 | static uint8_t lmask[8] = { |
| @@ -43,7 +182,7 @@ void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 ) | |||
| 43 | int iBytes = 1; | 182 | int iBytes = 1; |
| 44 | for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { } | 183 | for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { } |
| 45 | // sio << "iBytes = " << iBytes << sio.nl; | 184 | // sio << "iBytes = " << iBytes << sio.nl; |
| 46 | point uPt = ((*i) & lmask[7-iBytes])<<(6*(iBytes-1)); | 185 | Bu::UtfChar uPt = ((*i) & lmask[7-iBytes])<<(6*(iBytes-1)); |
| 47 | // sio << "mask: " << Bu::Fmt().radix(2).width(8).fill('0') | 186 | // sio << "mask: " << Bu::Fmt().radix(2).width(8).fill('0') |
| 48 | // << (int)lmask[7-iBytes] << sio.nl; | 187 | // << (int)lmask[7-iBytes] << sio.nl; |
| 49 | for( iBytes--; iBytes >= 1; iBytes-- ) | 188 | for( iBytes--; iBytes >= 1; iBytes-- ) |
| @@ -68,4 +207,4 @@ void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 ) | |||
| 68 | } | 207 | } |
| 69 | sio << sio.nl; | 208 | sio << sio.nl; |
| 70 | } | 209 | } |
| 71 | 210 | */ | |
diff --git a/src/utfstring.h b/src/utfstring.h index 6f85e93..79ef62e 100644 --- a/src/utfstring.h +++ b/src/utfstring.h | |||
| @@ -9,9 +9,12 @@ | |||
| 9 | #define BU_UTF_STRING_H | 9 | #define BU_UTF_STRING_H |
| 10 | 10 | ||
| 11 | #include <stdint.h> | 11 | #include <stdint.h> |
| 12 | #include "bu/array.h" | ||
| 12 | 13 | ||
| 13 | namespace Bu | 14 | namespace Bu |
| 14 | { | 15 | { |
| 16 | class String; | ||
| 17 | |||
| 15 | /** | 18 | /** |
| 16 | * UtfChar isn't actually a character, unicode specifies "code points" not | 19 | * UtfChar isn't actually a character, unicode specifies "code points" not |
| 17 | * characters. The main reason for this is that not all code points define | 20 | * characters. The main reason for this is that not all code points define |
| @@ -40,10 +43,23 @@ namespace Bu | |||
| 40 | UtfString( const Bu::String &sInput, Encoding eEnc=Utf8 ); | 43 | UtfString( const Bu::String &sInput, Encoding eEnc=Utf8 ); |
| 41 | virtual ~UtfString(); | 44 | virtual ~UtfString(); |
| 42 | 45 | ||
| 43 | static void debugUtf8( const Bu::String &sUtf8 ); | 46 | void append( UtfChar ch ); |
| 47 | |||
| 48 | void set( const Bu::String &sInput, Encoding eEnc=Utf8 ); | ||
| 49 | void setUtf8( const Bu::String &sInput ); | ||
| 50 | void setUtf16( const Bu::String &sInput ); | ||
| 51 | // void setUtf16be( const Bu::String &sInput ); | ||
| 52 | // void setUtf16le( const Bu::String &sInput ); | ||
| 53 | |||
| 54 | void debug(); | ||
| 55 | |||
| 56 | UtfChar get( int iIndex ); | ||
| 57 | |||
| 58 | private: | ||
| 59 | void append16( uint16_t i ) { aData.append( i ); } | ||
| 44 | 60 | ||
| 45 | private: | 61 | private: |
| 46 | uint16_t *pData; | 62 | Bu::Array<uint16_t> aData; |
| 47 | int iRawLen; | 63 | int iRawLen; |
| 48 | int iCharLen; | 64 | int iCharLen; |
| 49 | }; | 65 | }; |
