aboutsummaryrefslogtreecommitdiff
path: root/src/inprogress/xmlreader.h
blob: 708a3862f7044ad641f7a1cbcc0cef2f8a2adeac (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#ifndef XML_READER_H
#define XML_READER_H

#include <stdint.h>
#include "bu/stream.h"
#include "bu/fstring.h"
#include "bu/xmlnode.h"

namespace Bu
{
	/**
	 * An XML 1.1 reader.  I've decided to write this, this time, based on the
	 * official W3C recommendation, now included with the source code.  I've
	 * named the productions in the parser states the same as in that document,
	 * which may make them easier to find, etc., although possibly slightly less
	 * optimized than writing my own reduced grammar.
	 *
	 * Below I will list differences between my parser and the official standard
	 * as I come up with them.
	 *  - Encoding and Standalone headings are ignored for the moment. (4.3.3,
	 *    2.9)
	 *  - The standalone heading attribute can have any standard whitespace
	 *    before it (the specs say only spaces, no newlines). (2.9)
	 *  - Since standalone is ignored, it is currently allowed to have any
	 *    value (should be restricted to "yes" or "no"). (2.9)
	 *  - Currently only UTF-8 / ascii are parsed.
	 *  - [optional] The content of comments is thrown away. (2.5)
	 *  - The content of processing instruction blocks is parsed properly, but
	 *    thrown away. (2.6)
	 */
	class XmlReader
	{
	public:
		/**
		 * Create a reader that takes its input from a stream.
		 *@param sIn The stream to parse.  The class stores a reference
		 *		member of the same name, so presumably the stream has to
		 *		outlive the reader (TODO: confirm against the .cpp).
		 */
		XmlReader( Bu::Stream &sIn );
		virtual ~XmlReader();

		/**
		 * Run the parser over the input.
		 *@returns A pointer to an XmlNode.  Ownership of the returned node
		 *		is not visible from this header; check the implementation
		 *		before deleting or retaining it.
		 */
		XmlNode *read();

	private:
		Bu::Stream &sIn;	/**< Input source, held by reference. */
		Bu::FString sBuf;	/**< Buffered input (presumably data read from
							     sIn but not yet consumed). */

	private: // Helpers
		/**
		 * Peek at upcoming input.  NOTE(review): presumably returns a
		 * pointer to at least nAmnt buffered bytes without consuming them;
		 * verify against the implementation.
		 */
		const char *lookahead( int nAmnt );

		/**
		 * Discard input.  NOTE(review): presumably drops nAmnt bytes that
		 * were previously made available by lookahead(); verify.
		 */
		void burn( int nAmnt );

		/**
		 * Match a fixed string of nLen bytes against the input.
		 * NOTE(review): returns nothing, so a mismatch is presumably
		 * reported by exception; verify.
		 */
		void checkString( const char *str, int nLen );

	private: // States
		// Each state below carries the name of the matching production in
		// the W3C XML 1.1 recommendation.

		/**
		 * The prolog: headers and whatever else precedes the content.
		 */
		void prolog();

		/**
		 * The XML declaration (version, encoding, etc).
		 */
		void XMLDecl();

		/**
		 * Miscellaneous items; includes Comments and PIs (Processing
		 * Instructions).
		 */
		void Misc();

		/**
		 * A comment.
		 */
		void Comment();

		/**
		 * A processing instruction.
		 */
		void PI();

		/**
		 * Whitespace eater.
		 */
		void S();

		/**
		 * Optional whitespace eater.
		 */
		void Sq();

		/**
		 * The XML version specification.
		 */
		void VersionInfo();

		/**
		 * A plain equals sign along with any whitespace around it.
		 */
		void Eq();

		/**
		 * Read in an attribute value.
		 *@returns The value that was read.
		 */
		FString AttValue();

		/**
		 * Read in the name of something.
		 *@returns The name that was read.
		 */
		FString Name();

		/**
		 * The encoding declaration in the header.
		 */
		void EncodingDecl();

		/**
		 * The standalone declaration in the header.
		 */
		void SDDecl();

		/**
		 * Test a single byte for whitespace.  Static because the check
		 * depends only on its argument, never on reader state.
		 *@param c The byte to test.
		 *@returns true when c is a space (0x20), tab (0x9), carriage
		 *		return (0xD) or line feed (0xA); false otherwise.
		 */
		static bool isS( unsigned char c )
		{
			return ( c == 0x20 || c == 0x9 || c == 0xD || c == 0xA );
		}
	};
}

#endif