UtfString is going really well. It can now parse Utf8, Utf16 (le,be), and

Utf32 (le,be). The internal storage seems to be working fine. Random access is still a problem, but at least we can tell which half of a surrogate pair we're on, so we can always rapidly determine the entire code point from whatever UTF-16 index we're at. The only optimization that I'm not doing yet is reading in entire 16-bit or 32-bit words at a time and converting them from their byte order to native. There are a few potential issues with that, so we'll see. I added a couple of testing data files and a test program; I'll delete them all as soon as writing is verified to work correctly.
author: Mike Buland <eichlan@xagasoft.com> 2011-04-04 14:59:13 +0000
committer: Mike Buland <eichlan@xagasoft.com> 2011-04-04 14:59:13 +0000
commit: 6ff23ac5097f7d92ac8840c2ba17d1dbf1eb80a5 (patch)
tree: fc70404d66854bba713bff2350f5f69f43bd85bc /src/utfstring.cpp
parent: abbf45c1da7f3e3a542e6c6339a1bab31283f22e (diff)
download: libbu++-6ff23ac5097f7d92ac8840c2ba17d1dbf1eb80a5.tar.gz
libbu++-6ff23ac5097f7d92ac8840c2ba17d1dbf1eb80a5.tar.bz2
libbu++-6ff23ac5097f7d92ac8840c2ba17d1dbf1eb80a5.tar.xz
libbu++-6ff23ac5097f7d92ac8840c2ba17d1dbf1eb80a5.zip
1 files changed, 225 insertions, 15 deletions
diff --git a/src/utfstring.cpp b/src/utfstring.cpp
index bb0a011..7c4ba19 100644
--- a/src/utfstring.cpp
+++ b/src/utfstring.cpp
@@ -8,9 +8,13 @@
 #include "bu/utfstring.h"
 #include "bu/string.h"
+#include "bu/stream.h"
 #include <endian.h>
+#include "bu/sio.h"
+using Bu::sio;
 Bu::UtfString::UtfString()
 {
 }
@@ -33,20 +37,35 @@ void Bu::UtfString::set( const Bu::String &sInput, Encoding eEnc )
                        break;
                case Utf16:
-                case Utf16be:
                        setUtf16( sInput );
                        break;
+                case Utf16be:
+                        setUtf16be( sInput );
+                        break;
                case Utf16le:
-                        throw Bu::ExceptionBase("Utf16le not supported yet.");
+                        setUtf16le( sInput );
                        break;
                case Utf32:
-                        throw Bu::ExceptionBase("Utf32 not supported yet.");
+                        setUtf32( sInput );
+                        break;
+                case Utf32be:
+                        setUtf32be( sInput );
+                        break;
+                case Utf32le:
+                        setUtf32le( sInput );
+                        break;
+                case Ucs2:
+                        throw Bu::ExceptionBase("Ucs2 not supported yet.");
                        break;
-                case Ucs16:
+                case Ucs4:
-                        throw Bu::ExceptionBase("Ucs16 not supported yet.");
+                        throw Bu::ExceptionBase("Ucs4 not supported yet.");
                        break;
                case GuessEncoding:
@@ -104,8 +123,32 @@ void Bu::UtfString::setUtf8( const Bu::String &sInput )
 void Bu::UtfString::setUtf16( const Bu::String &sInput )
 {
+        Bu::String::const_iterator i = sInput.begin();
+        if( (uint8_t)*sInput.begin() == 0xFF &&
+                (uint8_t)*(sInput.begin()+1) == 0xFE )
+        {
+                setUtf16le( sInput );
+                return;
+        }
+        setUtf16be( sInput );
+}
+void Bu::UtfString::setUtf16be( const Bu::String &sInput )
+{
+        Bu::String::const_iterator i = sInput.begin();
+        if( (uint8_t)*sInput.begin() == 0xFE &&
+                (uint8_t)*(sInput.begin()+1) == 0xFF )
+        {
+                i += 2;
+                sio << "Verified big endian." << sio.nl;
+        }
+        else
+        {
+                sio << "Assuming big endian." << sio.nl;
+        }
        uint16_t hi, lo;
-        for( Bu::String::const_iterator i = sInput.begin(); i; i++ )
+        for( ; i; i++ )
        {
                hi = (((uint8_t)*i)<<8) | ((uint8_t)*(++i));
                append16( hi );
@@ -117,25 +160,192 @@ void Bu::UtfString::setUtf16( const Bu::String &sInput )
        }
 }
-#include "bu/sio.h"
+void Bu::UtfString::setUtf16le( const Bu::String &sInput )
-using Bu::sio;
+{
+        Bu::String::const_iterator i = sInput.begin();
+        if( (uint8_t)*sInput.begin() == 0xFF &&
+                (uint8_t)*(sInput.begin()+1) == 0xFE )
+        {
+                i += 2;
+                sio << "Verified little endian." << sio.nl;
+        }
+        else
+        {
+                sio << "Assuming little endian." << sio.nl;
+        }
+        uint16_t hi, lo;
+        for( ; i; i++ )
+        {
+                hi = (((uint8_t)*i)) | ((uint8_t)*(++i)<<8);
+                append16( hi );
+                if( (hi&0xD800u) == 0xD800u )
+                {
+                        lo = (((uint8_t)*(++i))) | ((uint8_t)*(++i)<<8);
+                        append16( lo );
+                }
+        }
+}
+void Bu::UtfString::setUtf32( const Bu::String &sInput )
+{
+        Bu::String::const_iterator i = sInput.begin();
+        if( (uint8_t)*i == 0x00 &&
+                (uint8_t)*(++i) == 0x00 &&
+                (uint8_t)*(++i) == 0xFF &&
+                (uint8_t)*(++i) == 0xFE )
+        {
+                setUtf32le( sInput );
+                return;
+        }
+        setUtf32be( sInput );
+}
+void Bu::UtfString::setUtf32be( const Bu::String &sInput )
+{
+        Bu::String::const_iterator i = sInput.begin();
+        if( (uint8_t)*i == 0x00 &&
+                (uint8_t)*(++i) == 0x00 &&
+                (uint8_t)*(++i) == 0xFE &&
+                (uint8_t)*(++i) == 0xFF )
+        {
+                i++;
+                sio << "Verified big endian." << sio.nl;
+        }
+        else
+        {
+                i = sInput.begin();
+                sio << "Assuming big endian." << sio.nl;
+        }
+        for( ; i; i++ )
+        {
+                append( (((uint8_t)*i)<<24) |
+                                (((uint8_t)*(++i))<<16) |
+                                (((uint8_t)*(++i))<<8) |
+                                ((uint8_t)*(++i))
+                          );
+        }
+}
+void Bu::UtfString::setUtf32le( const Bu::String &sInput )
+{
+        Bu::String::const_iterator i = sInput.begin();
+        if( (uint8_t)*i == 0x00 &&
+                (uint8_t)*(++i) == 0x00 &&
+                (uint8_t)*(++i) == 0xFF &&
+                (uint8_t)*(++i) == 0xFE )
+        {
+                i++;
+                sio << "Verified little endian." << sio.nl;
+        }
+        else
+        {
+                i = sInput.begin();
+                sio << "Assuming little endian." << sio.nl;
+        }
+        for( ; i; i++ )
+        {
+                append( ((uint8_t)*i) |
+                                (((uint8_t)*(++i))<<8) |
+                                (((uint8_t)*(++i))<<16) |
+                                (((uint8_t)*(++i))<<24)
+                          );
+        }
+}
+void Bu::UtfString::write( Bu::Stream &sOut, Encoding eEnc )
+{
+        switch( eEnc )
+        {
+                case Utf8:
+                        writeUtf8( sOut );
+                        break;
+                case Utf16:
+                        writeUtf16( sOut );
+                        break;
+                case Utf16be:
+                        writeUtf16be( sOut );
+                        break;
+                case Utf16le:
+                        writeUtf16le( sOut );
+                        break;
+                case Utf32:
+                        writeUtf32( sOut );
+                        break;
+                case Utf32be:
+                        writeUtf32be( sOut );
+                        break;
+                case Utf32le:
+                        writeUtf32le( sOut );
+                        break;
+                case Ucs2:
+                        throw Bu::ExceptionBase("Ucs2 not supported yet.");
+                        break;
+                case Ucs4:
+                        throw Bu::ExceptionBase("Ucs4 not supported yet.");
+                        break;
+                case GuessEncoding:
+                        throw Bu::ExceptionBase(
+                                "GuessEncoding is incompatible with encoding.");
+                        break;
+        }
+}
+void Bu::UtfString::writeUtf8( Bu::Stream &sOut )
+{
+}
+void Bu::UtfString::writeUtf16( Bu::Stream &sOut )
+{
+}
+void Bu::UtfString::writeUtf16be( Bu::Stream &sOut )
+{
+}
+void Bu::UtfString::writeUtf16le( Bu::Stream &sOut )
+{
+}
+void Bu::UtfString::writeUtf32( Bu::Stream &sOut )
+{
+}
+void Bu::UtfString::writeUtf32be( Bu::Stream &sOut )
+{
+}
+void Bu::UtfString::writeUtf32le( Bu::Stream &sOut )
+{
+}
 Bu::UtfChar Bu::UtfString::get( int iIndex )
 {
-        Bu::UtfChar i = aData[iIndex];
+        return nextChar( iIndex );
+}
+Bu::UtfChar Bu::UtfString::nextChar( int &iIndex )
+{
+        Bu::UtfChar i = aData[iIndex++];
        switch( i&0xFC00 )
        {
                case 0xD800:
-                        sio << "(hi) ";
+                        return (((i&0x3FF)<<10) | ((aData[iIndex++]&0x3FF)))+0x10000;
-                        return (((i&0x3FF)<<10) | ((aData[iIndex+1]&0x3FF)))+0x10000;
                case 0xDC00:
-                        sio << "(lo) ";
+                        return (((aData[iIndex-2]&0x3FF)<<10) | ((i&0x3FF)))+0x10000;
-                        return 0;
                default:
-                        sio << "(--) ";
+                        return i;
-                        return i&0xFC00;
        }
 }
author	Mike Buland <eichlan@xagasoft.com>	2011-04-04 14:59:13 +0000
committer	Mike Buland <eichlan@xagasoft.com>	2011-04-04 14:59:13 +0000
commit	6ff23ac5097f7d92ac8840c2ba17d1dbf1eb80a5 (patch)
tree	fc70404d66854bba713bff2350f5f69f43bd85bc /src/utfstring.cpp
parent	abbf45c1da7f3e3a542e6c6339a1bab31283f22e (diff)
download	libbu++-6ff23ac5097f7d92ac8840c2ba17d1dbf1eb80a5.tar.gz libbu++-6ff23ac5097f7d92ac8840c2ba17d1dbf1eb80a5.tar.bz2 libbu++-6ff23ac5097f7d92ac8840c2ba17d1dbf1eb80a5.tar.xz libbu++-6ff23ac5097f7d92ac8840c2ba17d1dbf1eb80a5.zip

diff --git a/src/utfstring.cpp b/src/utfstring.cpp index bb0a011..7c4ba19 100644 --- a/src/utfstring.cpp +++ b/src/utfstring.cpp
@@ -8,9 +8,13 @@
8	#include "bu/utfstring.h"	8	#include "bu/utfstring.h"
9		9
10	#include "bu/string.h"	10	#include "bu/string.h"
		11	#include "bu/stream.h"
11		12
12	#include <endian.h>	13	#include <endian.h>
13		14
		15	#include "bu/sio.h"
		16	using Bu::sio;
		17
14	Bu::UtfString::UtfString()	18	Bu::UtfString::UtfString()
15	{	19	{
16	}	20	}
@@ -33,20 +37,35 @@ void Bu::UtfString::set( const Bu::String &sInput, Encoding eEnc )
33	break;	37	break;
34		38
35	case Utf16:	39	case Utf16:
36	case Utf16be:
37	setUtf16( sInput );	40	setUtf16( sInput );
38	break;	41	break;
39		42
		43	case Utf16be:
		44	setUtf16be( sInput );
		45	break;
		46
40	case Utf16le:	47	case Utf16le:
41	throw Bu::ExceptionBase("Utf16le not supported yet.");	48	setUtf16le( sInput );
42	break;	49	break;
43		50
44	case Utf32:	51	case Utf32:
45	throw Bu::ExceptionBase("Utf32 not supported yet.");	52	setUtf32( sInput );
		53	break;
		54
		55	case Utf32be:
		56	setUtf32be( sInput );
		57	break;
		58
		59	case Utf32le:
		60	setUtf32le( sInput );
		61	break;
		62
		63	case Ucs2:
		64	throw Bu::ExceptionBase("Ucs2 not supported yet.");
46	break;	65	break;
47		66
48	case Ucs16:	67	case Ucs4:
49	throw Bu::ExceptionBase("Ucs16 not supported yet.");	68	throw Bu::ExceptionBase("Ucs4 not supported yet.");
50	break;	69	break;
51		70
52	case GuessEncoding:	71	case GuessEncoding:
@@ -104,8 +123,32 @@ void Bu::UtfString::setUtf8( const Bu::String &sInput )
104		123
105	void Bu::UtfString::setUtf16( const Bu::String &sInput )	124	void Bu::UtfString::setUtf16( const Bu::String &sInput )
106	{	125	{
		126	Bu::String::const_iterator i = sInput.begin();
		127	if( (uint8_t)*sInput.begin() == 0xFF &&
		128	(uint8_t)*(sInput.begin()+1) == 0xFE )
		129	{
		130	setUtf16le( sInput );
		131	return;
		132	}
		133	setUtf16be( sInput );
		134	}
		135
		136	void Bu::UtfString::setUtf16be( const Bu::String &sInput )
		137	{
		138	Bu::String::const_iterator i = sInput.begin();
		139	if( (uint8_t)*sInput.begin() == 0xFE &&
		140	(uint8_t)*(sInput.begin()+1) == 0xFF )
		141
		142	{
		143	i += 2;
		144	sio << "Verified big endian." << sio.nl;
		145	}
		146	else
		147	{
		148	sio << "Assuming big endian." << sio.nl;
		149	}
107	uint16_t hi, lo;	150	uint16_t hi, lo;
108	for( Bu::String::const_iterator i = sInput.begin(); i; i++ )	151	for( ; i; i++ )
109	{	152	{
110	hi = (((uint8_t)i)<<8) \| ((uint8_t)(++i));	153	hi = (((uint8_t)i)<<8) \| ((uint8_t)(++i));
111	append16( hi );	154	append16( hi );
@@ -117,25 +160,192 @@ void Bu::UtfString::setUtf16( const Bu::String &sInput )
117	}	160	}
118	}	161	}
119		162
120	#include "bu/sio.h"	163	void Bu::UtfString::setUtf16le( const Bu::String &sInput )
121	using Bu::sio;	164	{
		165	Bu::String::const_iterator i = sInput.begin();
		166	if( (uint8_t)*sInput.begin() == 0xFF &&
		167	(uint8_t)*(sInput.begin()+1) == 0xFE )
		168	{
		169	i += 2;
		170	sio << "Verified little endian." << sio.nl;
		171	}
		172	else
		173	{
		174	sio << "Assuming little endian." << sio.nl;
		175	}
		176	uint16_t hi, lo;
		177	for( ; i; i++ )
		178	{
		179	hi = (((uint8_t)i)) \| ((uint8_t)(++i)<<8);
		180	append16( hi );
		181	if( (hi&0xD800u) == 0xD800u )
		182	{
		183	lo = (((uint8_t)(++i))) \| ((uint8_t)(++i)<<8);
		184	append16( lo );
		185	}
		186	}
		187	}
		188
		189	void Bu::UtfString::setUtf32( const Bu::String &sInput )
		190	{
		191	Bu::String::const_iterator i = sInput.begin();
		192	if( (uint8_t)*i == 0x00 &&
		193	(uint8_t)*(++i) == 0x00 &&
		194	(uint8_t)*(++i) == 0xFF &&
		195	(uint8_t)*(++i) == 0xFE )
		196	{
		197	setUtf32le( sInput );
		198	return;
		199	}
		200	setUtf32be( sInput );
		201	}
		202
		203	void Bu::UtfString::setUtf32be( const Bu::String &sInput )
		204	{
		205	Bu::String::const_iterator i = sInput.begin();
		206	if( (uint8_t)*i == 0x00 &&
		207	(uint8_t)*(++i) == 0x00 &&
		208	(uint8_t)*(++i) == 0xFE &&
		209	(uint8_t)*(++i) == 0xFF )
		210	{
		211	i++;
		212	sio << "Verified big endian." << sio.nl;
		213	}
		214	else
		215	{
		216	i = sInput.begin();
		217	sio << "Assuming big endian." << sio.nl;
		218	}
		219	for( ; i; i++ )
		220	{
		221	append( (((uint8_t)*i)<<24) \|
		222	(((uint8_t)*(++i))<<16) \|
		223	(((uint8_t)*(++i))<<8) \|
		224	((uint8_t)*(++i))
		225	);
		226	}
		227	}
		228
		229	void Bu::UtfString::setUtf32le( const Bu::String &sInput )
		230	{
		231	Bu::String::const_iterator i = sInput.begin();
		232	if( (uint8_t)*i == 0x00 &&
		233	(uint8_t)*(++i) == 0x00 &&
		234	(uint8_t)*(++i) == 0xFF &&
		235	(uint8_t)*(++i) == 0xFE )
		236	{
		237	i++;
		238	sio << "Verified little endian." << sio.nl;
		239	}
		240	else
		241	{
		242	i = sInput.begin();
		243	sio << "Assuming little endian." << sio.nl;
		244	}
		245	for( ; i; i++ )
		246	{
		247	append( ((uint8_t)*i) \|
		248	(((uint8_t)*(++i))<<8) \|
		249	(((uint8_t)*(++i))<<16) \|
		250	(((uint8_t)*(++i))<<24)
		251	);
		252	}
		253	}
		254
		255	void Bu::UtfString::write( Bu::Stream &sOut, Encoding eEnc )
		256	{
		257	switch( eEnc )
		258	{
		259	case Utf8:
		260	writeUtf8( sOut );
		261	break;
		262
		263	case Utf16:
		264	writeUtf16( sOut );
		265	break;
		266
		267	case Utf16be:
		268	writeUtf16be( sOut );
		269	break;
		270
		271	case Utf16le:
		272	writeUtf16le( sOut );
		273	break;
		274
		275	case Utf32:
		276	writeUtf32( sOut );
		277	break;
		278
		279	case Utf32be:
		280	writeUtf32be( sOut );
		281	break;
		282
		283	case Utf32le:
		284	writeUtf32le( sOut );
		285	break;
		286
		287	case Ucs2:
		288	throw Bu::ExceptionBase("Ucs2 not supported yet.");
		289	break;
		290
		291	case Ucs4:
		292	throw Bu::ExceptionBase("Ucs4 not supported yet.");
		293	break;
		294
		295	case GuessEncoding:
		296	throw Bu::ExceptionBase(
		297	"GuessEncoding is incompatible with encoding.");
		298	break;
		299
		300	}
		301	}
		302
		303	void Bu::UtfString::writeUtf8( Bu::Stream &sOut )
		304	{
		305	}
		306
		307	void Bu::UtfString::writeUtf16( Bu::Stream &sOut )
		308	{
		309	}
		310
		311	void Bu::UtfString::writeUtf16be( Bu::Stream &sOut )
		312	{
		313	}
		314
		315	void Bu::UtfString::writeUtf16le( Bu::Stream &sOut )
		316	{
		317	}
		318
		319	void Bu::UtfString::writeUtf32( Bu::Stream &sOut )
		320	{
		321	}
		322
		323	void Bu::UtfString::writeUtf32be( Bu::Stream &sOut )
		324	{
		325	}
		326
		327	void Bu::UtfString::writeUtf32le( Bu::Stream &sOut )
		328	{
		329	}
122		330
123	Bu::UtfChar Bu::UtfString::get( int iIndex )	331	Bu::UtfChar Bu::UtfString::get( int iIndex )
124	{	332	{
125	Bu::UtfChar i = aData[iIndex];	333	return nextChar( iIndex );
		334	}
		335
		336	Bu::UtfChar Bu::UtfString::nextChar( int &iIndex )
		337	{
		338	Bu::UtfChar i = aData[iIndex++];
126	switch( i&0xFC00 )	339	switch( i&0xFC00 )
127	{	340	{
128	case 0xD800:	341	case 0xD800:
129	sio << "(hi) ";	342	return (((i&0x3FF)<<10) \| ((aData[iIndex++]&0x3FF)))+0x10000;
130	return (((i&0x3FF)<<10) \| ((aData[iIndex+1]&0x3FF)))+0x10000;
131		343
132	case 0xDC00:	344	case 0xDC00:
133	sio << "(lo) ";	345	return (((aData[iIndex-2]&0x3FF)<<10) \| ((i&0x3FF)))+0x10000;
134	return 0;
135		346
136	default:	347	default:
137	sio << "(--) ";	348	return i;
138	return i&0xFC00;
139	}	349	}
140	}	350	}
141		351