Code is all reorganized. We're about ready to release. I should write up a

little explanation of the arrangement.
author: Mike Buland <eichlan@xagasoft.com> 2012-03-25 20:00:08 +0000
committer: Mike Buland <eichlan@xagasoft.com> 2012-03-25 20:00:08 +0000
commit: 469bbcf0701e1eb8a6670c23145b0da87357e178 (patch)
tree: b5b062a16e46a6c5d3410b4e574cd0cc09057211 /src/unstable/utfstring.h
parent: ee1b79396076edc4e30aefb285fada03bb45e80d (diff)
download: libbu++-469bbcf0701e1eb8a6670c23145b0da87357e178.tar.gz
libbu++-469bbcf0701e1eb8a6670c23145b0da87357e178.tar.bz2
libbu++-469bbcf0701e1eb8a6670c23145b0da87357e178.tar.xz
libbu++-469bbcf0701e1eb8a6670c23145b0da87357e178.zip
1 files changed, 174 insertions, 0 deletions
diff --git a/src/unstable/utfstring.h b/src/unstable/utfstring.h
new file mode 100644
index 0000000..477e272
--- /dev/null
+++ b/src/unstable/utfstring.h
@@ -0,0 +1,174 @@
+/*
+ * Copyright (C) 2007-2011 Xagasoft, All rights reserved.
+ *
+ * This file is part of the libbu++ library and is released under the
+ * terms of the license contained in the file LICENSE.
+ */
+#ifndef BU_UTF_STRING_H
+#define BU_UTF_STRING_H
+#include <stdint.h>
+#include "bu/array.h"
+namespace Bu
+{
+        class String;
+        class Stream;
+        /**
+         * UtfChar isn't actually a character, unicode specifies "code points" not
+         * characters.  The main reason for this is that not all code points define
+         * usable characters.  Some control text directionality, some apply
+         * properties to other code points which are characters.  However, most of
+         * these distinctions are only important when implementing displays that
+         * comply with the Unicode standard fully.
+         */
+        typedef uint32_t UtfChar;
+        /**
+         * A unicode string.  This class represents a string of unicode code points.
+         * Every character in unicode can be represented with 21 bits, but we don't
+         * have a datatype that's 24 bits long, so we return all code points as a
+         * 32 bit unsigned value represented by Bu::UtfChar.  However, the UtfString
+         * class, for efficiency purposes doesn't store 32 bit values internally.
+         * It represents all code points in the native utf16 encoding.  This means
+         * that it may be very difficult to quickly determine the length of a
+         * UtfString in code points.  Unlike many Unicode handling systems, this
+         * one actually works with complete code points.  When using this class you
+         * don't ever have to know about the inner workings of the different
+         * encoding schemes.  All of the data is dealt with as whole code points.
+         *
+         * As an aside, this means that when encoding a UtfString to a Utf16
+         * encoding that matches your architecture this operation will be very
+         * fast since it will effectively be a raw dump of the internal data
+         * structures.  However, it is highly recommended that you DO NOT use the
+         * little endian encodings if you can possibly avoid it.  They are not
+         * recommended by the Unicode Consortium and are mainly supported as a
+         * means of communicating with other systems that encode their data
+         * incorrectly.  That said, whenever UtfString encodes the contained string
+         * it always includes a BOM at the beginning (the byte order marker) so that
+         * proper byte order can be easily determined by the program reading the
+         * data.
+         *
+         *@todo Investigate http://www.unicode.org/reports/tr6/ for compression.
+         */
+        class UtfString
+        {
+        public:
+                enum Encoding // encodings accepted by the constructor, set(), write() and get()
+                {
+                        Utf8,
+                        Utf16,
+                        Utf16be,
+                        Utf16le,
+                        Utf32,
+                        Utf32be,
+                        Utf32le,
+                        Ucs2,
+                        Ucs4,
+                        GuessEncoding // presumably asks for the encoding to be detected from the input - TODO confirm
+                };
+                UtfString();
+                UtfString( const Bu::String &sInput, Encoding eEnc=Utf8 ); // takes the same arguments as set()
+                virtual ~UtfString();
+                class iterator // NOTE(review): only operator* is declared; no ++/==/!= here and no begin()/end() on UtfString
+                {
+                private:
+                        iterator( UtfString *pSrc, int iCodePos ) : // NOTE(review): private and no friend is declared, so UtfString cannot call this
+                                pSrc( pSrc ), iCodePos( iCodePos )
+                        {
+                        }
+                public:
+                        iterator() : // detached iterator: no source string, position 0
+                                pSrc( NULL ), iCodePos( 0 )
+                        {
+                        }
+                        UtfChar operator*() // returns the code point at iCodePos; throws if the iterator is detached
+                        {
+                                if( !pSrc ) // default-constructed iterators have no source string
+                                        throw Bu::ExceptionBase("invalid UtfString::iterator dereferenced.");
+                                return pSrc->nextChar( iCodePos ); // NOTE(review): nextChar() takes its index by reference and is documented to advance it, so each dereference also moves iCodePos - confirm intended
+                        }
+                private:
+                        UtfString *pSrc; // string being iterated; NULL when detached
+                        int iCodePos; // index handed to nextChar()
+                };
+                /**
+                 * Append a UtfChar (a Unicode code point) to the string.  This can be
+                 * any valid code point, and is just the value of the code point, no
+                 * encoding necessary.
+                 */
+                void append( UtfChar ch );
+                /**
+                 * Set the value of the entire string based on the given input and
+                 * encoding.  The default encoding is Utf8, which is compatible with
+                 * 7-bit ASCII, so it's a great choice for setting UtfStrings from
+                 * string literals in code.
+                 */
+                void set( const Bu::String &sInput, Encoding eEnc=Utf8 );
+                /**
+                 * This encodes the UtfString in the given encoding and outputs it to
+                 * the provided stream.  All Utf16 and Utf32 encodings will have the
+                 * correct BOM (byte order marker) at the beginning.
+                 */
+                void write( Bu::Stream &sOut, Encoding eEnc=Utf8 );
+                /**
+                 * This encodes the UtfString in the given encoding and returns it as
+                 * a binary Bu::String.  Like write, this also includes the proper BOM
+                 * at the beginning.
+                 */
+                Bu::String get( Encoding eEnc=Utf8 );
+                void debug(); // undocumented here; presumably dumps internal state - TODO confirm
+                /**
+                 * This may or may not stick around.  Given an index, this returns a
+                 * code point; however, there isn't necessarily a 1:1 ratio between
+                 * indexes and code points.
+                 */
+                UtfChar get( int iIndex );
+                /**
+                 * This is what to use if you want to iterate through a section of the
+                 * UtfString and you want to use a numerical index.  In most cases it
+                 * will be much easier to use an iterator, though.  Given an index this
+                 * will return the code point at that position and increment iIndex an
+                 * appropriate amount for it to point to the next code point.
+                 */
+                UtfChar nextChar( int &iIndex );
+        private:
+                void append16( uint16_t i ) { aData.append( i ); } // appends one raw 16-bit unit to aData
+                void setUtf8( const Bu::String &sInput ); // per-encoding input helpers; presumably selected by set() from eEnc - TODO confirm
+                void setUtf16( const Bu::String &sInput );
+                void setUtf16be( const Bu::String &sInput );
+                void setUtf16le( const Bu::String &sInput );
+                void setUtf32( const Bu::String &sInput );
+                void setUtf32be( const Bu::String &sInput );
+                void setUtf32le( const Bu::String &sInput );
+                // Per-encoding output helpers; presumably selected by write() from eEnc - TODO confirm
+                void writeUtf8( Bu::Stream &sOut );
+                void writeUtf16be( Bu::Stream &sOut );
+                void writeUtf16le( Bu::Stream &sOut );
+                void writeUtf32be( Bu::Stream &sOut );
+                void writeUtf32le( Bu::Stream &sOut );
+        private:
+                Bu::Array<uint16_t> aData; // backing store of 16-bit units; append16() adds to it
+                int iRawLen; // presumably the length in 16-bit units - TODO confirm
+                int iCharLen; // presumably the length in code points - TODO confirm
+        };
+};
+#endif
author	Mike Buland <eichlan@xagasoft.com>	2012-03-25 20:00:08 +0000
committer	Mike Buland <eichlan@xagasoft.com>	2012-03-25 20:00:08 +0000
commit	469bbcf0701e1eb8a6670c23145b0da87357e178 (patch)
tree	b5b062a16e46a6c5d3410b4e574cd0cc09057211 /src/unstable/utfstring.h
parent	ee1b79396076edc4e30aefb285fada03bb45e80d (diff)
download	libbu++-469bbcf0701e1eb8a6670c23145b0da87357e178.tar.gz libbu++-469bbcf0701e1eb8a6670c23145b0da87357e178.tar.bz2 libbu++-469bbcf0701e1eb8a6670c23145b0da87357e178.tar.xz libbu++-469bbcf0701e1eb8a6670c23145b0da87357e178.zip

diff --git a/src/unstable/utfstring.h b/src/unstable/utfstring.h new file mode 100644 index 0000000..477e272 --- /dev/null +++ b/src/unstable/utfstring.h
@@ -0,0 +1,174 @@
	1	/*
	2	* Copyright (C) 2007-2011 Xagasoft, All rights reserved.
	3	*
	4	* This file is part of the libbu++ library and is released under the
	5	* terms of the license contained in the file LICENSE.
	6	*/
	7
	8	#ifndef BU_UTF_STRING_H
	9	#define BU_UTF_STRING_H
	10
	11	#include <stdint.h>
	12	#include "bu/array.h"
	13
	14	namespace Bu
	15	{
	16	class String;
	17	class Stream;
	18
	19	/**
	20	* UtfChar isn't actually a character, unicode specifies "code points" not
	21	* characters. The main reason for this is that not all code points define
	22	* usable characters. Some control text directionality, some apply
	23	* properties to other code points which are characters. However, most of
	24	* these distinctions are only important when implementing displays that
	25	* comply with the Unicode standard fully.
	26	*/
	27	typedef uint32_t UtfChar;
	28
	29	/**
	30	* A unicode string. This class represents a string of unicode code points.
	31	* Every character in unicode can be represented with 21 bits, but we don't
	32	* have a datatype that's 24 bits long, so we return all code points as a
	33	* 32 bit unsigned value represented by Bu::UtfChar. However, the UtfString
	34	* class, for efficiency purposes doesn't store 32 bit values internally.
	35	* It represents all code points in the native utf16 encoding. This means
	36	* that it may be very difficult to quickly determine the length of a
	37	* UtfString in code points. Unlike many Unicode handling systems, this
	38	* one actually works with complete code points. When using this class you
	39	* don't ever have to know about the inner workings of the different
	40	* encoding schemes. All of the data is dealt with as whole code points.
	41	*
	42	* As an aside, this means that when encoding a UtfString to a Utf16
	43	* encoding that matches your architecture this operation will be very
	44	* fast since it will effectively be a raw dump of the internal data
	45	* structures. However, it is highly recommended that you DO NOT use the
	46	* little endian encodings if you can possibly avoid it. They are not
	47	* recommended by the Unicode Consortium and are mainly supported as a
	48	* means of communicating with other systems that encode their data
	49	* incorrectly. That said, whenever UtfString encodes the contained string
	50	* it always includes a BOM at the beginning (the byte order marker) so that
	51	* proper byte order can be easily determined by the program reading the
	52	* data.
	53	*
	54	*@todo Investigate http://www.unicode.org/reports/tr6/ for compression.
	55	*/
	56	class UtfString
	57	{
	58	public:
	59	enum Encoding // encodings accepted by the constructor, set(), write() and get()
	60	{
	61	Utf8,
	62	Utf16,
	63	Utf16be,
	64	Utf16le,
	65	Utf32,
	66	Utf32be,
	67	Utf32le,
	68	Ucs2,
	69	Ucs4,
	70	GuessEncoding // presumably asks for the encoding to be detected from the input - TODO confirm
	71	};
	72
	73	UtfString();
	74	UtfString( const Bu::String &sInput, Encoding eEnc=Utf8 ); // takes the same arguments as set()
	75	virtual ~UtfString();
	76
	77	class iterator // NOTE(review): only operator* is declared; no ++/==/!= here and no begin()/end() on UtfString
	78	{
	79	private:
	80	iterator( UtfString *pSrc, int iCodePos ) : // NOTE(review): private and no friend is declared, so UtfString cannot call this
	81	pSrc( pSrc ), iCodePos( iCodePos )
	82	{
	83	}
	84
	85	public:
	86	iterator() : // detached iterator: no source string, position 0
	87	pSrc( NULL ), iCodePos( 0 )
	88	{
	89	}
	90
	91	UtfChar operator*() // returns the code point at iCodePos; throws if the iterator is detached
	92	{
	93	if( !pSrc ) // default-constructed iterators have no source string
	94	throw Bu::ExceptionBase("invalid UtfString::iterator dereferenced.");
	95	return pSrc->nextChar( iCodePos ); // NOTE(review): nextChar() takes its index by reference and is documented to advance it, so each dereference also moves iCodePos - confirm intended
	96	}
	97
	98	private:
	99	UtfString *pSrc; // string being iterated; NULL when detached
	100	int iCodePos; // index handed to nextChar()
	101	};
	102
	103	/**
	104	* Append a UtfChar (a Unicode code point) to the string. This can be
	105	* any valid code point, and is just the value of the code point, no
	106	* encoding necessary.
	107	*/
	108	void append( UtfChar ch );
	109
	110	/**
	111	* Set the value of the entire string based on the given input and
	112	* encoding. The default encoding is Utf8, which is compatible with
	113	* 7-bit ASCII, so it's a great choice for setting UtfStrings from
	114	* string literals in code.
	115	*/
	116	void set( const Bu::String &sInput, Encoding eEnc=Utf8 );
	117
	118	/**
	119	* This encodes the UtfString in the given encoding and outputs it to
	120	* the provided stream. All Utf16 and Utf32 encodings will have the
	121	* correct BOM (byte order marker) at the beginning.
	122	*/
	123	void write( Bu::Stream &sOut, Encoding eEnc=Utf8 );
	124
	125	/**
	126	* This encodes the UtfString in the given encoding and returns it as
	127	* a binary Bu::String. Like write, this also includes the proper BOM
	128	* at the beginning.
	129	*/
	130	Bu::String get( Encoding eEnc=Utf8 );
	131
	132	void debug(); // undocumented here; presumably dumps internal state - TODO confirm
	133
	134	/**
	135	* This may or may not stick around. Given an index, this returns a
	136	* code point; however, there isn't necessarily a 1:1 ratio between
	137	* indexes and code points.
	138	*/
	139	UtfChar get( int iIndex );
	140
	141	/**
	142	* This is what to use if you want to iterate through a section of the
	143	* UtfString and you want to use a numerical index. In most cases it
	144	* will be much easier to use an iterator, though. Given an index this
	145	* will return the code point at that position and increment iIndex an
	146	* appropriate amount for it to point to the next code point.
	147	*/
	148	UtfChar nextChar( int &iIndex );
	149
	150	private:
	151	void append16( uint16_t i ) { aData.append( i ); } // appends one raw 16-bit unit to aData
	152
	153	void setUtf8( const Bu::String &sInput ); // per-encoding input helpers; presumably selected by set() from eEnc - TODO confirm
	154	void setUtf16( const Bu::String &sInput );
	155	void setUtf16be( const Bu::String &sInput );
	156	void setUtf16le( const Bu::String &sInput );
	157	void setUtf32( const Bu::String &sInput );
	158	void setUtf32be( const Bu::String &sInput );
	159	void setUtf32le( const Bu::String &sInput );
	160	// Per-encoding output helpers; presumably selected by write() from eEnc - TODO confirm
	161	void writeUtf8( Bu::Stream &sOut );
	162	void writeUtf16be( Bu::Stream &sOut );
	163	void writeUtf16le( Bu::Stream &sOut );
	164	void writeUtf32be( Bu::Stream &sOut );
	165	void writeUtf32le( Bu::Stream &sOut );
	166
	167	private:
	168	Bu::Array<uint16_t> aData; // backing store of 16-bit units; append16() adds to it
	169	int iRawLen; // presumably the length in 16-bit units - TODO confirm
	170	int iCharLen; // presumably the length in code points - TODO confirm
	171	};
	172	};
	173
	174	#endif