Rearranged the API a bit.

author: Mike Buland <eichlan@xagasoft.com> 2011-04-08 00:51:24 +0000
committer: Mike Buland <eichlan@xagasoft.com> 2011-04-08 00:51:24 +0000
commit: bc6d543210a9df6f578229c6050371ced665fd69 (patch)
tree: cf8bd3ac5b204d40697089e56ac3cb1445d960b4 /src/utfstring.h
parent: 27aecbc60be6c80ce221f29c01f743de714faa63 (diff)
download: libbu++-bc6d543210a9df6f578229c6050371ced665fd69.tar.gz
libbu++-bc6d543210a9df6f578229c6050371ced665fd69.tar.bz2
libbu++-bc6d543210a9df6f578229c6050371ced665fd69.tar.xz
libbu++-bc6d543210a9df6f578229c6050371ced665fd69.zip
1 files changed, 75 insertions, 13 deletions
diff --git a/src/utfstring.h b/src/utfstring.h
index be3e6ad..477e272 100644
--- a/src/utfstring.h
+++ b/src/utfstring.h
@@ -26,6 +26,33 @@ namespace Bu
         */
        typedef uint32_t UtfChar;
+        /**
+         * A unicode string.  This class represents a string of unicode code points.
+         * Every character in unicode can be represented with 21 bits, but we don't
+         * have a datatype that's 24 bits long, so we return all code points as a
+         * 32 bit unsigned value represented by Bu::UtfChar.  However, the UtfString
+         * class, for efficiency purposes doesn't store 32 bit values internally.
+         * It represents all code points in the native utf16 encoding.  This means
+         * that it may be very difficult to quickly determine the length of a
+         * UtfString in code points.  Unlike many Unicode handling systems, this
+         * one actually works with complete code points.  When using this class you
+         * don't ever have to know about the inner workings of the different
+         * encoding schemes.  All of the data is dealt with as whole code points.
+         *
+         * As an aside, this means that when encoding a UtfString to a Utf16
+         * encoding that matches your architecture this operation will be very
+         * fast since it will effectively be a raw dump of the internal data
+         * structures.  However, it is highly recommended that you DO NOT use the
+         * little endian encodings if you can possibly avoid it.  They are not
+         * recommended by the Unicode Consortium and are mainly supported as a
+         * means of communicating with other systems that encode their data
+         * incorrectly.  That said, whenever UtfString encodes the contained string
+         * it always includes a BOM at the beginning (the byte order marker) so that
+         * proper byte order can be easily determined by the program reading the
+         * data.
+         *
+         *@todo Investigate http://www.unicode.org/reports/tr6/ for compression.
+         */
        class UtfString
        {
        public:
@@ -73,9 +100,56 @@ namespace Bu
                        int iCodePos;
                };
+                /**
+                 * Append a UtfChar (A unicode code point) to the string.  This can be
+                 * any valid code point, and is just the value of the code point, no
+                 * encoding necessary.
+                 */
                void append( UtfChar ch );
+                /**
+                 * Set the value of the entire string based on the given input and
+                 * encoding.  The default encoding is Utf8, which is compatible with
+                 * 7-bit ascii, so it's a great choice for setting UtfStrings from
+                 * string literals in code.
+                 */
                void set( const Bu::String &sInput, Encoding eEnc=Utf8 );
+                /**
+                 * This encodes the UtfString in the given encoding and outputs it to
+                 * the provided stream.  All Utf16 and Utf32 encodings will have the
+                 * correct BOM (byte order marker) at the beginning.
+                 */
+                void write( Bu::Stream &sOut, Encoding eEnc=Utf8 );
+                /**
+                 * This encodes the UtfString in the given encoding and returns it as
+                 * a binary Bu::String.  Like write, this also includes the proper BOM
+                 * at the beginning.
+                 */
+                Bu::String get( Encoding eEnc=Utf8 );
+                void debug();
+                /**
+                 * This may or may not stick around.  Given an index, this returns a
+                 * codepoint; however, there isn't necessarily a 1:1 ratio between
+                 * indexes and code points.
+                 */
+                UtfChar get( int iIndex );
+                /**
+                 * This is what to use if you want to iterate through a section of the
+                 * UtfString and you want to use a numerical index.  In most cases it
+                 * will be much easier to use an iterator, though.  Given an index this
+                 * will return the codepoint at that position and increment iIndex an
+                 * appropriate amount for it to point to the next code point.
+                 */
+                UtfChar nextChar( int &iIndex );
+        private:
+                void append16( uint16_t i ) { aData.append( i ); }
                void setUtf8( const Bu::String &sInput );
                void setUtf16( const Bu::String &sInput );
                void setUtf16be( const Bu::String &sInput );
@@ -83,25 +157,13 @@ namespace Bu
                void setUtf32( const Bu::String &sInput );
                void setUtf32be( const Bu::String &sInput );
                void setUtf32le( const Bu::String &sInput );
+                
-                void write( Bu::Stream &sOut, Encoding eEnc=Utf8 );
                void writeUtf8( Bu::Stream &sOut );
                void writeUtf16be( Bu::Stream &sOut );
                void writeUtf16le( Bu::Stream &sOut );
                void writeUtf32be( Bu::Stream &sOut );
                void writeUtf32le( Bu::Stream &sOut );
-                Bu::String to( Encoding eEnc=Utf8 );
-                Bu::String toUtf8();
-                void debug();
-                UtfChar get( int iIndex );
-                UtfChar nextChar( int &iIndex );
-        private:
-                void append16( uint16_t i ) { aData.append( i ); }
        private:
                Bu::Array<uint16_t> aData;
                int iRawLen;
author	Mike Buland <eichlan@xagasoft.com>	2011-04-08 00:51:24 +0000
committer	Mike Buland <eichlan@xagasoft.com>	2011-04-08 00:51:24 +0000
commit	bc6d543210a9df6f578229c6050371ced665fd69 (patch)
tree	cf8bd3ac5b204d40697089e56ac3cb1445d960b4 /src/utfstring.h
parent	27aecbc60be6c80ce221f29c01f743de714faa63 (diff)
download	libbu++-bc6d543210a9df6f578229c6050371ced665fd69.tar.gz libbu++-bc6d543210a9df6f578229c6050371ced665fd69.tar.bz2 libbu++-bc6d543210a9df6f578229c6050371ced665fd69.tar.xz libbu++-bc6d543210a9df6f578229c6050371ced665fd69.zip

diff --git a/src/utfstring.h b/src/utfstring.h index be3e6ad..477e272 100644 --- a/src/utfstring.h +++ b/src/utfstring.h
@@ -26,6 +26,33 @@ namespace Bu
26	*/	26	*/
27	typedef uint32_t UtfChar;	27	typedef uint32_t UtfChar;
28		28
		29	/**
		30	* A unicode string. This class represents a string of unicode code points.
		31	* Every character in unicode can be represented with 21 bits, but we don't
		32	* have a datatype that's 24 bits long, so we return all code points as a
		33	* 32 bit unsigned value represented by Bu::UtfChar. However, the UtfString
		34	* class, for efficiency purposes doesn't store 32 bit values internally.
		35	* It represents all code points in the native utf16 encoding. This means
		36	* that it may be very difficult to quickly determine the length of a
		37	* UtfString in code points. Unlike many Unicode handling systems, this
		38	* one actually works with complete code points. When using this class you
		39	* don't ever have to know about the inner workings of the different
		40	* encoding schemes. All of the data is dealt with as whole code points.
		41	*
		42	* As an aside, this means that when encoding a UtfString to a Utf16
		43	* encoding that matches your archetecture this operation will be very
		44	* fast since it will effectively be a raw dump of the internal data
		45	* structures. However, it is highly recommended that you DO NOT use the
		46	* little endian encodings if you can possibly avoid it. They are not
		47	* recommended by the Unicode Consortium and are mainly supported as a
		48	* means of communicating with other systems that encode their data
		49	* incorrectly. That said, whenever UtfString encodes the contained string
		50	* it always includes a BOM at the beginning (the byte order marker) so that
		51	* proper byte order can be easily determined by the program reading the
		52	* data.
		53	*
		54	*@todo Investigate http://www.unicode.org/reports/tr6/ for compression.
		55	*/
29	class UtfString	56	class UtfString
30	{	57	{
31	public:	58	public:
@@ -73,9 +100,56 @@ namespace Bu
73	int iCodePos;	100	int iCodePos;
74	};	101	};
75		102
		103	/**
		104	* Append a UtfChar (A unicode code point) to the string. This can be
		105	* any valid code point, and is just the value of the code point, no
		106	* encoding necessary.
		107	*/
76	void append( UtfChar ch );	108	void append( UtfChar ch );
77		109
		110	/**
		111	* Set the value of the entire string based on the given input and
		112	* encoding. The default encoding is Utf8, which is compatible with
		113	* 7-bit ascii, so it's a great choice for setting UtfStrings from
		114	* string literals in code.
		115	*/
78	void set( const Bu::String &sInput, Encoding eEnc=Utf8 );	116	void set( const Bu::String &sInput, Encoding eEnc=Utf8 );
		117
		118	/**
		119	* This encodes the UtfString in the given encoding and outputs it to
		120	* the provided stream. All Utf16 and Utf32 encodings will have the
		121	* correct BOM (byte order marker) at the beginning.
		122	*/
		123	void write( Bu::Stream &sOut, Encoding eEnc=Utf8 );
		124
		125	/**
		126	* This encodes the UtfString in the given encoding and returns it as
		127	* a binary Bu::String. Like write, this also includes the proper BOM
		128	* at the beginning.
		129	*/
		130	Bu::String get( Encoding eEnc=Utf8 );
		131
		132	void debug();
		133
		134	/**
		135	* This may or may not stick around. Given an index, this returns a
		136	* codepoint; however, there isn't necessarily a 1:1 ratio between
		137	* indexes and code points.
		138	*/
		139	UtfChar get( int iIndex );
		140
		141	/**
		142	* This is what to use if you want to iterate through a section of the
		143	* UtfString and you want to use a numerical index. In most cases it
		144	* will be much easier to use an iterator, though. Given an index this
		145	* will return the codepoint at that position and increment iIndex an
		146	* appropriate amount for it to point to the next code point.
		147	*/
		148	UtfChar nextChar( int &iIndex );
		149
		150	private:
		151	void append16( uint16_t i ) { aData.append( i ); }
		152
79	void setUtf8( const Bu::String &sInput );	153	void setUtf8( const Bu::String &sInput );
80	void setUtf16( const Bu::String &sInput );	154	void setUtf16( const Bu::String &sInput );
81	void setUtf16be( const Bu::String &sInput );	155	void setUtf16be( const Bu::String &sInput );
@@ -83,25 +157,13 @@ namespace Bu
83	void setUtf32( const Bu::String &sInput );	157	void setUtf32( const Bu::String &sInput );
84	void setUtf32be( const Bu::String &sInput );	158	void setUtf32be( const Bu::String &sInput );
85	void setUtf32le( const Bu::String &sInput );	159	void setUtf32le( const Bu::String &sInput );
86		160
87	void write( Bu::Stream &sOut, Encoding eEnc=Utf8 );
88	void writeUtf8( Bu::Stream &sOut );	161	void writeUtf8( Bu::Stream &sOut );
89	void writeUtf16be( Bu::Stream &sOut );	162	void writeUtf16be( Bu::Stream &sOut );
90	void writeUtf16le( Bu::Stream &sOut );	163	void writeUtf16le( Bu::Stream &sOut );
91	void writeUtf32be( Bu::Stream &sOut );	164	void writeUtf32be( Bu::Stream &sOut );
92	void writeUtf32le( Bu::Stream &sOut );	165	void writeUtf32le( Bu::Stream &sOut );
93		166
94	Bu::String to( Encoding eEnc=Utf8 );
95	Bu::String toUtf8();
96
97	void debug();
98
99	UtfChar get( int iIndex );
100	UtfChar nextChar( int &iIndex );
101
102	private:
103	void append16( uint16_t i ) { aData.append( i ); }
104
105	private:	167	private:
106	Bu::Array<uint16_t> aData;	168	Bu::Array<uint16_t> aData;
107	int iRawLen;	169	int iRawLen;