From 789eaff64b6dcdf920eb3f5a5d64ab4f1f33aa05 Mon Sep 17 00:00:00 2001 From: Mike Buland Date: Wed, 28 Jun 2006 07:28:17 +0000 Subject: Entities now work in the xml processor the way they should: you can use the 5 builtin ones (gt, lt, apos, quot, amp) and also define your own. The parser now skips any text declaration at the top, which is fine for most xml that you get these days. I think if we ever make the break to full compliance we'll need to make a new parser from scratch. --- src/unit/xml.cpp | 16 +++++ src/xmlreader.cpp | 194 ++++++++++++++++++++++++++++++++++++++++++++++++++---- src/xmlreader.h | 24 ++++++- 3 files changed, 220 insertions(+), 14 deletions(-) diff --git a/src/unit/xml.cpp b/src/unit/xml.cpp index 559b2f4..e4d779c 100644 --- a/src/unit/xml.cpp +++ b/src/unit/xml.cpp @@ -15,6 +15,10 @@ public: TEST_ADD( XmlCoreTestSuite::badXml01 ) TEST_ADD( XmlCoreTestSuite::badXml02 ) TEST_ADD( XmlCoreTestSuite::badXml03 ) + + TEST_ADD( XmlCoreTestSuite::entityBuiltin01 ) + + TEST_ADD( XmlCoreTestSuite::entityDoc01 ) } private: @@ -32,6 +36,18 @@ private: { TEST_THROWS( XmlStringReader r("><&'""); + TEST_ASSERT( strcmp( r.getRoot()->getContent(), "><&\'\"" ) == 0 ); + } + + void entityDoc01() + { + XmlStringReader r(""&name;""); + TEST_ASSERT( strcmp( r.getRoot()->getContent(), "\"bob the man\"" ) == 0 ); + } }; int main( int argc, char *argv[] ) diff --git a/src/xmlreader.cpp b/src/xmlreader.cpp index 70fd1d7..2a5f63f 100644 --- a/src/xmlreader.cpp +++ b/src/xmlreader.cpp @@ -1,14 +1,32 @@ #include "xmlreader.h" #include "xmlexception.h" #include +#include "hashfunctionstring.h" XmlReader::XmlReader( bool bStrip ) : - bStrip( bStrip ) + bStrip( bStrip ), + htEntity( new HashFunctionString(), 11 ) { } XmlReader::~XmlReader() { + void *i = htEntity.getFirstItemPos(); + while( (i = htEntity.getNextItemPos( i ) ) ) + { + free( (char *)(htEntity.getItemID( i )) ); + delete (StaticString *)htEntity.getItemData( i ); + } +} + +void 
XmlReader::addEntity( const char *name, const char *value ) +{ + if( htEntity[name] ) return; + + char *sName = strdup( name ); + StaticString *sValue = new StaticString( value ); + + htEntity.insert( sName, sValue ); } #define gcall( x ) if( x == false ) return false; @@ -39,11 +57,129 @@ bool XmlReader::buildDoc() { // take care of initial whitespace gcall( ws() ); + textDecl(); + entity(); + addEntity("gt", ">"); + addEntity("lt", "<"); + addEntity("amp", "&"); + addEntity("apos", "\'"); + addEntity("quot", "\""); gcall( node() ); return true; } +void XmlReader::textDecl() +{ + char chr; + if( getChar() == '<' && getChar( 1 ) == '?' ) + { + usedChar( 2 ); + for(;;) + { + if( getChar() == '?' ) + { + if( getChar( 1 ) == '>' ) + { + usedChar( 2 ); + return; + } + } + usedChar(); + } + } +} + +void XmlReader::entity() +{ + for(;;) + { + ws(); + + if( getChar() == '<' && getChar( 1 ) == '!' ) + { + usedChar( 2 ); + ws(); + std::string buf; + for(;;) + { + char chr = getChar(); + usedChar(); + if( isws( chr ) ) break; + buf += chr; + } + + if( strcmp( buf.c_str(), "ENTITY") == 0 ) + { + ws(); + std::string name; + for(;;) + { + char chr = getChar(); + usedChar(); + if( isws( chr ) ) break; + name += chr; + } + ws(); + char quot = getChar(); + usedChar(); + if( quot != '\'' && quot != '\"' ) + { + throw XmlException( + "Only quoted entity values are supported." 
+ ); + } + std::string value; + for(;;) + { + char chr = getChar(); + usedChar(); + if( chr == '&' ) + { + StaticString *tmp = getEscape(); + if( tmp == NULL ) throw XmlException("Entity thing"); + value += tmp->getString(); + delete tmp; + } + else if( chr == quot ) + { + break; + } + else + { + value += chr; + } + } + ws(); + if( getChar() == '>' ) + { + usedChar(); + + addEntity( name.c_str(), value.c_str() ); + } + else + { + throw XmlException( + "Malformed ENTITY: unexpected '%c' found.", + getChar() + ); + } + } + else + { + throw XmlException( + "Unsupported header symbol: %s", + buf.c_str() + ); + } + } + else + { + return; + } + } +} + bool XmlReader::node() { gcall( startNode() ) @@ -190,13 +326,18 @@ bool XmlReader::paramlist() return true; } -char XmlReader::getEscape() +StaticString *XmlReader::getEscape() { - // Right now, we just do # escapes... if( getChar( 1 ) == '#' ) { - usedChar(); - usedChar(); + // If the entity starts with a # it's a character escape code + int base = 10; + usedChar( 2 ); + if( getChar() == 'x' ) + { + base = 16; + usedChar(); + } char buf[4]; int j = 0; for( j = 0; getChar() != ';'; j++ ) @@ -206,11 +347,29 @@ char XmlReader::getEscape() } usedChar(); buf[j] = '\0'; - return (char)atoi( buf ); + buf[0] = (char)strtol( buf, (char **)NULL, base ); + buf[1] = '\0'; + + return new StaticString( buf ); } else { - return '\0'; + // ...otherwise replace with the appropriate string... 
+ std::string buf; + usedChar(); + for(;;) + { + char cbuf = getChar(); + usedChar(); + if( cbuf == ';' ) break; + buf += cbuf; + } + + StaticString *tmp = (StaticString *)htEntity[buf.c_str()]; + if( tmp == NULL ) return NULL; + + StaticString *ret = new StaticString( *tmp ); + return ret; } } @@ -260,9 +419,10 @@ bool XmlReader::param() { if( chr == '&' ) { - chr = getEscape(); - if( chr == '\0' ) return false; - fbValue.appendData( chr ); + StaticString *tmp = getEscape(); + if( tmp == NULL ) return false; + fbValue.appendData( tmp->getString() ); + delete tmp; } else { @@ -287,9 +447,10 @@ bool XmlReader::param() { if( chr == '&' ) { - chr = getEscape(); - if( chr == '\0' ) return false; - fbValue.appendData( chr ); + StaticString *tmp = getEscape(); + if( tmp == NULL ) return false; + fbValue.appendData( tmp->getString() ); + delete tmp; } else { @@ -425,6 +586,13 @@ bool XmlReader::content() if( bStrip ) gcall( ws() ); } + else if( chr == '&' ) + { + StaticString *tmp = getEscape(); + if( tmp == NULL ) return false; + fbContent.appendData( tmp->getString() ); + delete tmp; + } else { fbContent.appendData( chr ); diff --git a/src/xmlreader.h b/src/xmlreader.h index 4117dfd..a9881cb 100644 --- a/src/xmlreader.h +++ b/src/xmlreader.h @@ -4,6 +4,8 @@ #include #include "xmldocument.h" #include "flexbuf.h" +#include "hashtable.h" +#include "staticstring.h" /** * Takes care of reading in xml formatted data from a file. This could/should @@ -90,7 +92,25 @@ private: */ bool name(); - char getEscape(); + /** + * Automoton function: textDecl. Processes the xml text declaration, if + * there is one. + */ + void textDecl(); + + /** + * Automoton function: entity. Processes an entity from the header. + */ + void entity(); + + /** + * Adds an entity to the list, if it doesn't already exist. 
+ *@param name The name of the entity + *@param value The value of the entity + */ + void addEntity( const char *name, const char *value ); + + StaticString *getEscape(); /** * Automoton function: paramlist. Processes a list of node params. @@ -114,6 +134,8 @@ private: FlexBuf fbParamName; /**< buffer for the current param's name. */ FlexBuf fbParamValue; /**< buffer for the current param's value. */ bool bStrip; /**< Are we stripping whitespace? */ + + HashTable htEntity; /**< Entity type definitions. */ }; #endif -- cgit v1.2.3