diff options
author | Mike Buland <eichlan@xagasoft.com> | 2011-03-22 19:25:42 +0000 |
---|---|---|
committer | Mike Buland <eichlan@xagasoft.com> | 2011-03-22 19:25:42 +0000 |
commit | 88004d87d513dcba767b1dae1e5199a89b22ce36 (patch) | |
tree | 06051330e18e44407edc25d28fe978e0637ed90e /src | |
parent | 9d7ee5a5b9b6ca2093043b7c584df02913739b02 (diff) | |
download | libbu++-88004d87d513dcba767b1dae1e5199a89b22ce36.tar.gz libbu++-88004d87d513dcba767b1dae1e5199a89b22ce36.tar.bz2 libbu++-88004d87d513dcba767b1dae1e5199a89b22ce36.tar.xz libbu++-88004d87d513dcba767b1dae1e5199a89b22ce36.zip |
We now have a UTF-8 test parser; I think I'm going to move it into a functor.
Diffstat (limited to '')
-rw-r--r-- | src/tests/utf.cpp | 22 | ||||
-rw-r--r-- | src/utfstring.cpp | 54 | ||||
-rw-r--r-- | src/utfstring.h | 4 |
3 files changed, 80 insertions, 0 deletions
diff --git a/src/tests/utf.cpp b/src/tests/utf.cpp new file mode 100644 index 0000000..59d49c6 --- /dev/null +++ b/src/tests/utf.cpp | |||
@@ -0,0 +1,22 @@ | |||
1 | #include <bu/file.h> | ||
2 | #include <bu/string.h> | ||
3 | #include <bu/utfstring.h> | ||
4 | |||
5 | int main( int argc, char *argv[] ) | ||
6 | { | ||
7 | argc--, argv++; | ||
8 | |||
9 | for( char **sFile = argv; *sFile; sFile++ ) | ||
10 | { | ||
11 | Bu::File fIn( *sFile, Bu::File::Read ); | ||
12 | Bu::String sUtf8; | ||
13 | char buf[4096]; | ||
14 | while( !fIn.isEos() ) | ||
15 | { | ||
16 | int iAmnt = fIn.read( buf, 4096 ); | ||
17 | sUtf8.append( buf, iAmnt ); | ||
18 | } | ||
19 | Bu::UtfString::debugUtf8( sUtf8 ); | ||
20 | } | ||
21 | } | ||
22 | |||
diff --git a/src/utfstring.cpp b/src/utfstring.cpp index eb23713..0e2060b 100644 --- a/src/utfstring.cpp +++ b/src/utfstring.cpp | |||
@@ -7,6 +7,8 @@ | |||
7 | 7 | ||
8 | #include "bu/utfstring.h" | 8 | #include "bu/utfstring.h" |
9 | 9 | ||
10 | #include "bu/string.h" | ||
11 | |||
10 | Bu::UtfString::UtfString() | 12 | Bu::UtfString::UtfString() |
11 | { | 13 | { |
12 | } | 14 | } |
@@ -15,3 +17,55 @@ Bu::UtfString::~UtfString() | |||
15 | { | 17 | { |
16 | } | 18 | } |
17 | 19 | ||
20 | #include "bu/sio.h" | ||
21 | using Bu::sio; | ||
22 | |||
23 | void Bu::UtfString::debugUtf8( const Bu::String &sUtf8 ) | ||
24 | { | ||
25 | static uint8_t lmask[8] = { | ||
26 | 0x00, | ||
27 | 0x01, | ||
28 | 0x03, | ||
29 | 0x07, | ||
30 | 0x0f, | ||
31 | 0x1f, | ||
32 | 0x3f, | ||
33 | 0x7f | ||
34 | }; | ||
35 | for( Bu::String::const_iterator i = sUtf8.begin(); i; i++ ) | ||
36 | { | ||
37 | if( i != sUtf8.begin() ) | ||
38 | sio << ", "; | ||
39 | if( ((int)(uint8_t)*i)&0x80 ) | ||
40 | { | ||
41 | // sio << "Flag byte: " << Bu::Fmt().radix(2).width(8).fill('0') | ||
42 | // << (int)(uint8_t)*i << sio.nl; | ||
43 | int iBytes = 1; | ||
44 | for(; (((uint8_t)(*i))<<iBytes)&0x80; iBytes++ ) { } | ||
45 | // sio << "iBytes = " << iBytes << sio.nl; | ||
46 | point uPt = ((*i) & lmask[7-iBytes])<<(6*(iBytes-1)); | ||
47 | // sio << "mask: " << Bu::Fmt().radix(2).width(8).fill('0') | ||
48 | // << (int)lmask[7-iBytes] << sio.nl; | ||
49 | for( iBytes--; iBytes >= 1; iBytes-- ) | ||
50 | { | ||
51 | // sio << "iBytes = " << iBytes << ", shift = " << (6*(iBytes-1)) | ||
52 | // << sio.nl; | ||
53 | // sio << "next: " << Bu::Fmt().radix(2).width(8).fill('0') | ||
54 | // << (int)(uint8_t)*i << sio.nl | ||
55 | // << "mask: " << Bu::Fmt().radix(2).width(8).fill('0') | ||
56 | // << (int)lmask[6] << sio.nl; | ||
57 | i++; | ||
58 | uPt |= ((*i)&lmask[6])<<(6*(iBytes-1)); | ||
59 | } | ||
60 | sio << uPt; | ||
61 | // sio << " (" << Bu::Fmt( 8, 2 ).fill('0') | ||
62 | // << uPt << ")"; | ||
63 | } | ||
64 | else | ||
65 | { | ||
66 | sio << (int)((uint8_t)*i); | ||
67 | } | ||
68 | } | ||
69 | sio << sio.nl; | ||
70 | } | ||
71 | |||
diff --git a/src/utfstring.h b/src/utfstring.h index 56e544e..3bdf51c 100644 --- a/src/utfstring.h +++ b/src/utfstring.h | |||
@@ -12,6 +12,8 @@ | |||
12 | 12 | ||
13 | namespace Bu | 13 | namespace Bu |
14 | { | 14 | { |
15 | class String; | ||
16 | |||
15 | class UtfString | 17 | class UtfString |
16 | { | 18 | { |
17 | public: | 19 | public: |
@@ -20,6 +22,8 @@ namespace Bu | |||
20 | 22 | ||
21 | typedef uint32_t point; | 23 | typedef uint32_t point; |
22 | 24 | ||
25 | static void debugUtf8( const Bu::String &sUtf8 ); | ||
26 | |||
23 | private: | 27 | private: |
24 | // typedef BasicString<uint16_t> RawString; | 28 | // typedef BasicString<uint16_t> RawString; |
25 | // RawString rsStore; | 29 | // RawString rsStore; |