diff options
-rw-r--r-- | misc/w3c-xml-1.1.html | 3673 | ||||
-rw-r--r-- | misc/xml-grammar | 7 | ||||
-rw-r--r-- | src/fstring.h | 21 | ||||
-rw-r--r-- | src/unit/xml.cpp | 39 | ||||
-rw-r--r-- | src/xmlreader.cpp | 165 | ||||
-rw-r--r-- | src/xmlreader.h | 54 |
6 files changed, 2360 insertions, 1599 deletions
diff --git a/misc/w3c-xml-1.1.html b/misc/w3c-xml-1.1.html index 6a9211a..89a502c 100644 --- a/misc/w3c-xml-1.1.html +++ b/misc/w3c-xml-1.1.html | |||
@@ -1,1598 +1,2075 @@ | |||
1 | <?xml version="1.0" encoding="UTF-8"?><!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html lang="EN" xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html;charset=UTF-8" /><title>Extensible Markup Language (XML) 1.1</title><style type="text/css"> | 1 | <?xml version="1.0" encoding="UTF-8"?><!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html lang="en" xml:lang="en" xmlns="http://www.w3.org/1999/xhtml"><head><title>Extensible Markup Language (XML) 1.1 (Second Edition)</title><style type="text/css"> |
2 | code { font-family: monospace; } | 2 | code { font-family: monospace; } |
3 | 3 | ||
4 | div.constraint, | 4 | div.constraint, |
5 | div.issue, | 5 | div.issue, |
6 | div.note, | 6 | div.note, |
7 | div.notice { margin-left: 2em; } | 7 | div.notice { margin-left: 2em; } |
8 | 8 | ||
9 | li p { margin-top: 0.3em; | 9 | ol.enumar { list-style-type: decimal; } |
10 | margin-bottom: 0.3em; } | 10 | ol.enumla { list-style-type: lower-alpha; } |
11 | 11 | ol.enumlr { list-style-type: lower-roman; } | |
12 | div.exampleInner pre { margin-left: 1em; | 12 | ol.enumua { list-style-type: upper-alpha; } |
13 | margin-top: 0em; margin-bottom: 0em} | 13 | ol.enumur { list-style-type: upper-roman; } |
14 | div.exampleOuter {border: 4px double gray; | 14 | |
15 | margin: 0em; padding: 0em} | 15 | |
16 | div.exampleInner { background-color: #d5dee3; | 16 | div.exampleInner pre { margin-left: 1em; |
17 | border-top-width: 4px; | 17 | margin-top: 0em; margin-bottom: 0em} |
18 | border-top-style: double; | 18 | div.exampleOuter {border: 4px double gray; |
19 | border-top-color: #d3d3d3; | 19 | margin: 0em; padding: 0em} |
20 | border-bottom-width: 4px; | 20 | div.exampleInner { background-color: #d5dee3; |
21 | border-bottom-style: double; | 21 | border-top-width: 4px; |
22 | border-bottom-color: #d3d3d3; | 22 | border-top-style: double; |
23 | padding: 4px; margin: 0em } | 23 | border-top-color: #d3d3d3; |
24 | div.exampleWrapper { margin: 4px } | 24 | border-bottom-width: 4px; |
25 | div.exampleHeader { font-weight: bold; | 25 | border-bottom-style: double; |
26 | margin: 4px} | 26 | border-bottom-color: #d3d3d3; |
27 | 27 | padding: 4px; margin: 0em } | |
28 | em.rfc2119 { text-transform: lowercase; | 28 | div.exampleWrapper { margin: 4px } |
29 | font-variant: small-caps; | 29 | div.exampleHeader { font-weight: bold; |
30 | font-style: normal; } | 30 | margin: 4px} |
31 | </style><link rel="stylesheet" type="text/css" href="http://www.w3.org/StyleSheets/TR/W3C-REC.css" /></head><body><div class="head"><p><a href="http://www.w3.org/"><img src="http://www.w3.org/Icons/w3c_home" alt="W3C" height="48" width="72" /></a></p> | 31 | |
32 | <h1><a name="title" id="title" />Extensible Markup Language (XML) 1.1</h1> | 32 | em.rfc2119 { text-transform: lowercase; |
33 | <h2><a name="w3c-doctype" id="w3c-doctype" />W3C Recommendation 04 | 33 | font-variant: small-caps; |
34 | February 2004, edited in place 15 April 2004</h2><dl><dt>This version:</dt><dd><a href="http://www.w3.org/TR/2004/REC-xml11-20040204/">http://www.w3.org/TR/2004/REC-xml11-20040204/</a></dd><dt>Latest version:</dt><dd><a href="http://www.w3.org/TR/xml11">http://www.w3.org/TR/xml11</a></dd><dt>Previous version:</dt><dd><a href="http://www.w3.org/TR/2003/PR-xml11-20031105/">http://www.w3.org/TR/2003/PR-xml11-20031105/</a></dd><dt>Editors:</dt><dd>Tim Bray, Textuality and Netscape <a href="mailto:tbray@textuality.com"><tbray@textuality.com></a></dd><dd>Jean Paoli, Microsoft <a href="mailto:jeanpa@microsoft.com"><jeanpa@microsoft.com></a></dd><dd>C. M. Sperberg-McQueen, W3C <a href="mailto:cmsmcq@w3.org"><cmsmcq@w3.org></a></dd><dd>Eve Maler, Sun Microsystems, Inc. <a href="mailto:elm@east.sun.com"><eve.maler@east.sun.com></a></dd><dd>François Yergeau <a href="mailto:fyergeau@alis.com"><fyergeau@alis.com></a></dd><dd>John Cowan <a href="mailto:cowan@ccil.org"><cowan@ccil.org></a></dd></dl><p>Please refer to the <a href="http://www.w3.org/XML/xml-V11-1e-errata"><strong>errata</strong></a> for this document, which may include some normative corrections.</p><p>This document is also available in these non-normative formats: <a href="REC-xml11-20040204.xml">XML</a> and <a href="REC-xml11-20040204-review.html">XHTML with color-coded revision indicators</a>.</p><p>See also <a href="http://www.w3.org/2003/03/Translations/byTechnology?technology=xml11"><strong>translations</strong></a>.</p><p class="copyright"><a href="http://www.w3.org/Consortium/Legal/ipr-notice#Copyright">Copyright</a> © 2004 <a href="http://www.w3.org/"><acronym title="World Wide Web Consortium">W3C</acronym></a><sup>®</sup> (<a href="http://www.csail.mit.edu/"><acronym title="Massachusetts Institute of Technology">MIT</acronym></a>, <a href="http://www.ercim.org/"><acronym title="European Research Consortium for Informatics and Mathematics">ERCIM</acronym></a>, <a 
href="http://www.keio.ac.jp/">Keio</a>), All Rights Reserved. W3C <a href="http://www.w3.org/Consortium/Legal/ipr-notice#Legal_Disclaimer">liability</a>, <a href="http://www.w3.org/Consortium/Legal/ipr-notice#W3C_Trademarks">trademark</a>, <a href="http://www.w3.org/Consortium/Legal/copyright-documents">document use</a> and <a href="http://www.w3.org/Consortium/Legal/copyright-software">software licensing</a> rules apply.</p></div><hr /><div> <h2><a name="abstract" id="abstract" />Abstract</h2><p>The Extensible Markup Language (XML) is a subset of SGML that is completely | 34 | font-style: normal; } |
35 | described in this document. Its goal is to enable generic SGML to be served, | 35 | </style><link rel="stylesheet" type="text/css" href="http://www.w3.org/StyleSheets/TR/W3C-REC.css" /></head><body><div class="head"><p><a href="http://www.w3.org/"><img src="http://www.w3.org/Icons/w3c_home" alt="W3C" height="48" width="72" /></a></p> <h1><a name="title" id="title" />Extensible Markup Language (XML) 1.1 (Second Edition)</h1> <h2><a name="w3c-doctype" id="w3c-doctype" />W3C Recommendation 16 August 2006, edited in place 29 September 2006</h2><dl><dt>This version:</dt><dd> |
36 | received, and processed on the Web in the way that is now possible with HTML. | 36 | <a href="http://www.w3.org/TR/2006/REC-xml11-20060816">http://www.w3.org/TR/2006/REC-xml11-20060816</a> |
37 | XML has been designed for ease of implementation and for interoperability | 37 | </dd><dt>Latest version:</dt><dd> |
38 | with both SGML and HTML.</p></div><div> <h2><a name="status" id="status" />Status of this Document</h2><p><em>This section describes the status of this document at the time of its publication. Other documents may supersede this document. A list of current W3C publications and the latest revision of this technical report can be found in the <a href="http://www.w3.org/TR/">W3C technical reports index</a> at http://www.w3.org/TR/.</em></p><p>This document is a <a href="http://www.w3.org/2003/06/Process-20030618/tr.html#RecsW3C">Recommendation</a> of the W3C. | 38 | <a href="http://www.w3.org/TR/xml11">http://www.w3.org/TR/xml11</a> |
39 | It has been reviewed by W3C Members and other interested parties, and has | 39 | </dd><dt>Previous version:</dt><dd> |
40 | been endorsed by the Director as a W3C Recommendation. It is a stable document and may be used as reference material or cited as a normative reference from another document. W3C's role in making the | 40 | <a href="http://www.w3.org/TR/2006/PER-xml11-20060614">http://www.w3.org/TR/2006/PER-xml11-20060614</a> |
41 | Recommendation is to draw attention to the specification and to promote its widespread deployment. | 41 | </dd><dt>Editors:</dt><dd>Tim Bray, Textuality and Netscape <a href="mailto:tbray@textuality.com"><tbray@textuality.com></a></dd><dd>Jean Paoli, Microsoft <a href="mailto:jeanpa@microsoft.com"><jeanpa@microsoft.com></a></dd><dd>C. M. Sperberg-McQueen, W3C <a href="mailto:cmsmcq@w3.org"><cmsmcq@w3.org></a></dd><dd>Eve Maler, Sun Microsystems, Inc. <a href="mailto:elm@east.sun.com"><eve.maler@east.sun.com></a></dd><dd>François Yergeau</dd><dd>John Cowan <a href="mailto:cowan@ccil.org"><cowan@ccil.org></a></dd></dl><p>Please refer to the <a href="http://www.w3.org/XML/xml-V11-2e-errata"><strong>errata</strong></a> for this document, which may |
42 | This enhances the functionality and interoperability of the Web.</p><p>This document specifies a syntax created by subsetting an existing, widely | 42 | include some normative corrections.</p><p>The <a href="http://www.w3.org/XML/xml-V11-1e-errata">previous errata</a> for this document, are also available.</p><p>See also <a href="http://www.w3.org/2003/03/Translations/byTechnology?technology=xml11"><strong>translations</strong></a>.</p><p>This document is also available in these non-normative formats: <a href="REC-xml11-20060816.xml">XML</a> and <a href="REC-xml11-20060816-review.html">XHTML with color-coded revision indicators</a>.</p><p class="copyright"><a href="http://www.w3.org/Consortium/Legal/ipr-notice#Copyright">Copyright</a> © 2006 <a href="http://www.w3.org/"><acronym title="World Wide Web Consortium">W3C</acronym></a><sup>®</sup> (<a href="http://www.csail.mit.edu/"><acronym title="Massachusetts Institute of Technology">MIT</acronym></a>, <a href="http://www.ercim.org/"><acronym title="European Research Consortium for Informatics and Mathematics">ERCIM</acronym></a>, <a href="http://www.keio.ac.jp/">Keio</a>), All Rights Reserved. W3C <a href="http://www.w3.org/Consortium/Legal/ipr-notice#Legal_Disclaimer">liability</a>, <a href="http://www.w3.org/Consortium/Legal/ipr-notice#W3C_Trademarks">trademark</a> and <a href="http://www.w3.org/Consortium/Legal/copyright-documents">document use</a> rules apply.</p></div><hr/><div> <h2><a name="abstract" id="abstract"/>Abstract</h2><p>The Extensible Markup Language (XML) is a subset of SGML that is completely |
43 | used international text processing standard (Standard Generalized Markup Language, | 43 | described in this document. Its goal is to enable generic SGML to be served, |
44 | ISO 8879:1986(E) as amended and corrected) for use on the World Wide Web. | 44 | received, and processed on the Web in the way that is now possible with HTML. |
45 | It is a product of the <a | 45 | XML has been designed for ease of implementation and for interoperability |
46 | href="http://www.w3.org/XML/Activity.html">W3C XML | 46 | with both SGML and HTML.</p></div><div> <h2><a name="status" id="status"/>Status of this Document</h2><p><em>This section describes the status of this document at the time of its publication. |
47 | Activity</a>.</p> | 47 | Other documents may supersede this document. A list of current W3C publications and the |
48 | 48 | latest revision of this technical report can be found in the <a href="http://www.w3.org/TR/">W3C technical reports index</a> at | |
49 | <p>On 15 April 2004, this document was edited in place to add two | 49 | http://www.w3.org/TR/.</em></p><p>This document specifies a syntax created by subsetting an existing, widely |
50 | missing spaces to <a | 50 | used international text processing standard (Standard Generalized Markup Language, |
51 | href="http://www.w3.org/TR/2004/REC-xml11-20040204/Overview.html#NT-document">production | 51 | ISO 8879:1986(E) as amended and corrected) for use on the World Wide Web. |
52 | [1]</a> in section 2.1</p> | 52 | It is a product of the <a href="http://www.w3.org/XML/Core/">XML Core Working Group</a> |
53 | 53 | as part of the <a href="http://www.w3.org/XML/Activity">XML Activity</a>.</p> | |
54 | <p>The English version of this specification is the only normative version. However, | 54 | <p>On 29 September 2006 this document was edited in place to remove a |
55 | for translations of this document, see <a href="http://www.w3.org/2003/03/Translations/byTechnology?technology=xml11">http://www.w3.org/2003/03/Translations/byTechnology?technology=xml11</a>. | 55 | number of spurious and potentially misleading spaces.</p> |
56 | </p><p>Documentation of intellectual property possibly relevant to this recommendation | 56 | <p>The English version of this specification is the only normative version. However, |
57 | may be found at the Working Group's public | 57 | for translations of this document, see <a href="http://www.w3.org/2003/03/Translations/byTechnology?technology=xml11">http://www.w3.org/2003/03/Translations/byTechnology?technology=xml11</a>.</p><p>This document is a <a href="http://www.w3.org/2005/10/Process-20051014/tr.html#q74">W3C Recommendation</a>. |
58 | <a href="http://www.w3.org/2002/08/xmlcore-IPR-statements">IPR disclosure page</a>.</p><p>An implementation report for XML 1.1 is available at <a href="http://www.w3.org/XML/2002/09/xml11-implementation.html">http://www.w3.org/XML/2002/09/xml11-implementation.html</a>.</p><p>Please report errors in this document to <a href="mailto:xml-editor@w3.org">xml-editor@w3.org</a>; <a href="http://lists.w3.org/Archives/Public/xml-editor">archives</a> are available. The errata list for this edition is available | 58 | This second edition is <em>not</em> a new version of XML. As a convenience to readers, |
59 | at <a href="http://www.w3.org/XML/xml-V11-1e-errata">http://www.w3.org/XML/xml-V11-1e-errata</a>.</p><p>A <a href="http://www.w3.org/XML/Test/">Test Suite</a> is maintained to help assessing conformance to this specification.</p></div><div class="toc"> <h2><a name="contents" id="contents" />Table of Contents</h2><p class="toc">1 <a href="#sec-intro">Introduction</a><br /> 1.1 <a href="#sec-origin-goals">Origin and Goals</a><br /> 1.2 <a href="#sec-terminology">Terminology</a><br /> 1.3 <a href="#sec-xml11">Rationale and list of changes for XML 1.1</a><br /> 2 <a href="#sec-documents">Documents</a><br /> 2.1 <a href="#sec-well-formed">Well-Formed XML Documents</a><br /> 2.2 <a href="#charsets">Characters</a><br /> 2.3 <a href="#sec-common-syn">Common Syntactic Constructs</a><br /> 2.4 <a href="#syntax">Character Data and Markup</a><br /> 2.5 <a href="#sec-comments">Comments</a><br /> 2.6 <a href="#sec-pi">Processing Instructions</a><br /> 2.7 <a href="#sec-cdata-sect">CDATA Sections</a><br /> 2.8 <a href="#sec-prolog-dtd">Prolog and Document Type Declaration</a><br /> 2.9 <a href="#sec-rmd">Standalone Document Declaration</a><br /> 2.10 <a href="#sec-white-space">White Space Handling</a><br /> 2.11 <a href="#sec-line-ends">End-of-Line Handling</a><br /> 2.12 <a href="#sec-lang-tag">Language Identification</a><br /> 2.13 <a href="#sec-normalization-checking">Normalization Checking</a><br /> 3 <a href="#sec-logical-struct">Logical Structures</a><br /> 3.1 <a href="#sec-starttags">Start-Tags, End-Tags, and Empty-Element Tags</a><br /> 3.2 <a href="#elemdecls">Element Type Declarations</a><br /> 3.2.1 <a href="#sec-element-content">Element Content</a><br /> 3.2.2 <a href="#sec-mixed-content">Mixed Content</a><br /> 3.3 <a href="#attdecls">Attribute-List Declarations</a><br /> 3.3.1 <a href="#sec-attribute-types">Attribute Types</a><br /> 3.3.2 <a href="#sec-attr-defaults">Attribute Defaults</a><br /> 3.3.3 <a href="#AVNormalize">Attribute-Value Normalization</a><br 
/> 3.4 <a href="#sec-condition-sect">Conditional Sections</a><br /> 4 <a href="#sec-physical-struct">Physical Structures</a><br /> 4.1 <a href="#sec-references">Character and Entity References</a><br /> 4.2 <a href="#sec-entity-decl">Entity Declarations</a><br /> 4.2.1 <a href="#sec-internal-ent">Internal Entities</a><br /> 4.2.2 <a href="#sec-external-ent">External Entities</a><br /> 4.3 <a href="#TextEntities">Parsed Entities</a><br /> 4.3.1 <a href="#sec-TextDecl">The Text Declaration</a><br /> 4.3.2 <a href="#wf-entities">Well-Formed Parsed Entities</a><br /> 4.3.3 <a href="#charencoding">Character Encoding in Entities</a><br /> 4.3.4 <a href="#sec-version-info">Version Information in Entities</a><br /> 4.4 <a href="#entproc">XML Processor Treatment of Entities and References</a><br /> 4.4.1 <a href="#not-recognized">Not Recognized</a><br /> 4.4.2 <a href="#included">Included</a><br /> 4.4.3 <a href="#include-if-valid">Included If Validating</a><br /> 4.4.4 <a href="#forbidden">Forbidden</a><br /> 4.4.5 <a href="#inliteral">Included in Literal</a><br /> 4.4.6 <a href="#notify">Notify</a><br /> 4.4.7 <a href="#bypass">Bypassed</a><br /> 4.4.8 <a href="#as-PE">Included as PE</a><br /> 4.4.9 <a href="#error">Error</a><br /> 4.5 <a href="#intern-replacement">Construction of Entity Replacement Text</a><br /> 4.6 <a href="#sec-predefined-ent">Predefined Entities</a><br /> 4.7 <a href="#Notations">Notation Declarations</a><br /> 4.8 <a href="#sec-doc-entity">Document Entity</a><br /> 5 <a href="#sec-conformance">Conformance</a><br /> 5.1 <a href="#proc-types">Validating and Non-Validating Processors</a><br /> 5.2 <a href="#safe-behavior">Using XML Processors</a><br /> 6 <a href="#sec-notation">Notation</a><br /> </p> <h3><a name="appendices" id="appendices" />Appendices</h3><p class="toc">A <a href="#sec-bibliography">References</a><br /> A.1 <a href="#sec-existing-stds">Normative References</a><br /> A.2 <a href="#null">Other References</a><br /> B <a 
href="#sec-CharNorm">Definitions for Character Normalization</a><br /> C <a href="#sec-entexpand">Expansion of Entity and Character References</a> (Non-Normative)<br /> D <a href="#determinism">Deterministic Content Models</a> (Non-Normative)<br /> E <a href="#sec-guessing">Autodetection of Character Encodings</a> (Non-Normative)<br /> E.1 <a href="#sec-guessing-no-ext-info">Detection Without External Encoding Information</a><br /> E.2 <a href="#sec-guessing-with-ext-info">Priorities in the Presence of External Encoding Information</a><br /> F <a href="#sec-xml-wg">W3C XML Working Group</a> (Non-Normative)<br /> G <a href="#sec-core-wg">W3C XML Core Working Group</a> (Non-Normative)<br /> H <a href="#prod-notes">Production Notes</a> (Non-Normative)<br /> I <a href="#sec-suggested-names">Suggestions for XML Names</a> (Non-Normative)<br /> </p></div><hr /><div class="body"><div class="div1"> <h2><a name="sec-intro" id="sec-intro" />1 Introduction</h2><p>Extensible Markup Language, abbreviated XML, describes a class of data | 59 | it incorporates the changes dictated by the accumulated errata (available at |
60 | objects called <a title="XML Document" href="#dt-xml-doc">XML documents</a> and partially | 60 | <a href="http://www.w3.org/XML/xml-V11-1e-errata">http://www.w3.org/XML/xml-V11-1e-errata</a>) to the <a href="http://www.w3.org/TR/2004/REC-xml11-20040204/">First |
61 | describes the behavior of computer programs which process them. XML is an | 61 | Edition of XML 1.1, dated 4 February 2004</a>. In addition, the markup introduced to clarify when prescriptive |
62 | application profile or restricted form of SGML, the Standard Generalized Markup | 62 | keywords are used |
63 | Language <a href="#ISO8879">[ISO 8879]</a>. By construction, XML documents are conforming | 63 | in the formal sense defined in <a href="#rfc2119">[IETF RFC 2119]</a>, has been modified to better match the intent of <a href="#rfc2119">[IETF RFC 2119]</a>. |
64 | SGML documents.</p><p>XML documents are made up of storage units called <a title="Entity" href="#dt-entity">entities</a>, | 64 | This edition supersedes the previous <a href="http://www.w3.org/TR/2004/REC-xml11-20040204">W3C Recommendation |
65 | which contain either parsed or unparsed data. Parsed data is made up of <a title="Character" href="#dt-character">characters</a>, some of which form <a title="Character Data" href="#dt-chardata">character | 65 | of 4 February 2004</a>.</p><p>Please report errors in this document to the public <a href="mailto:xml-editor@w3.org">xml-editor@w3.org</a> mailing list; <a href="http://lists.w3.org/Archives/Public/xml-editor/">archives</a> are available. For the convenience of readers, |
66 | data</a>, and some of which form <a title="Markup" href="#dt-markup">markup</a>. | 66 | an <a href="REC-xml11-20060816-review.html">XHTML version with color-coded revision indicators</a> is |
67 | Markup encodes a description of the document's storage layout and logical | 67 | also provided; this version highlights each change due to an erratum published in the |
68 | structure. XML provides a mechanism to impose constraints on the storage layout | 68 | <a href="http://www.w3.org/XML/xml-V11-1e-errata">errata list</a>, together with a link to the particular |
69 | and logical structure.</p><p>[<a name="dt-xml-proc" id="dt-xml-proc" title="XML Processor">Definition</a>: A software module called | 69 | erratum in that list. Most of the errata in the list provide a rationale for the change. |
70 | an <b>XML processor</b> is used to read XML documents and provide access | 70 | The errata list for this second edition is available at <a href="http://www.w3.org/XML/xml-V11-2e-errata">http://www.w3.org/XML/xml-V11-2e-errata</a>.</p><p>An implementation report is available at <a href="http://www.w3.org/XML/2006/06/xml11-2e-implementation.html">http://www.w3.org/XML/2006/06/xml11-2e-implementation.html</a>. A <a href="http://www.w3.org/XML/Test/">Test Suite</a> is maintained to help assessing conformance to this specification.</p><p>This document has been reviewed by W3C Members, by software developers, and by other W3C groups and interested parties, and is endorsed by the Director as a W3C Recommendation. It is a stable document and may be used as reference material or cited from another document. W3C's role in making the Recommendation is to draw attention to the specification and to promote its widespread deployment. This enhances the functionality and interoperability of the Web.</p><p>This document is governed by the <a href="http://www.w3.org/TR/2002/NOTE-patent-practice-20020124">24 |
71 | to their content and structure.] [<a name="dt-app" id="dt-app" title="Application">Definition</a>: It | 71 | January 2002 CPP</a> as amended by the <a href="http://www.w3.org/2004/02/05-pp-transition">W3C Patent Policy |
72 | is assumed that an XML processor is doing its work on behalf of another module, | 72 | Transition Procedure</a>. W3C maintains a <a href="http://www.w3.org/2002/08/xmlcore-IPR-statements" rel="disclosure">public list of |
73 | called the <b>application</b>.] This specification describes | 73 | any patent disclosures</a> made in connection with the deliverables of |
74 | the required behavior of an XML processor in terms of how it must read XML | 74 | the group; that page also includes instructions for disclosing a patent. |
75 | data and the information it must provide to the application.</p><div class="div2"> <h3><a name="sec-origin-goals" id="sec-origin-goals" />1.1 Origin and Goals</h3><p>XML was developed by an XML Working Group (originally known as the SGML | 75 | An individual who has actual knowledge of a patent which the individual |
76 | Editorial Review Board) formed under the auspices of the World Wide Web Consortium | 76 | believes contains <a href="http://www.w3.org/Consortium/Patent-Policy-20040205/#def-essential">Essential |
77 | (W3C) in 1996. It was chaired by Jon Bosak of Sun Microsystems with the active | 77 | Claim(s)</a> must disclose the information in accordance with <a href="http://www.w3.org/Consortium/Patent-Policy-20040205/#sec-Disclosure">section |
78 | participation of an XML Special Interest Group (previously known as the SGML | 78 | 6 of the W3C Patent Policy</a>.</p></div><div class="toc"> <h2><a name="contents" id="contents"/>Table of Contents</h2><p class="toc">1 <a href="#sec-intro">Introduction</a><br/> 1.1 <a href="#sec-origin-goals">Origin and Goals</a><br/> 1.2 <a href="#sec-terminology">Terminology</a><br/> 1.3 <a href="#sec-xml11">Rationale and list of changes for XML 1.1</a><br/> 2 <a href="#sec-documents">Documents</a><br/> 2.1 <a href="#sec-well-formed">Well-Formed XML Documents</a><br/> 2.2 <a href="#charsets">Characters</a><br/> 2.3 <a href="#sec-common-syn">Common Syntactic Constructs</a><br/> 2.4 <a href="#syntax">Character Data and Markup</a><br/> 2.5 <a href="#sec-comments">Comments</a><br/> 2.6 <a href="#sec-pi">Processing Instructions</a><br/> 2.7 <a href="#sec-cdata-sect">CDATA Sections</a><br/> 2.8 <a href="#sec-prolog-dtd">Prolog and Document Type Declaration</a><br/> 2.9 <a href="#sec-rmd">Standalone Document Declaration</a><br/> 2.10 <a href="#sec-white-space">White Space Handling</a><br/> 2.11 <a href="#sec-line-ends">End-of-Line Handling</a><br/> 2.12 <a href="#sec-lang-tag">Language Identification</a><br/> 2.13 <a href="#sec-normalization-checking">Normalization Checking</a><br/> 3 <a href="#sec-logical-struct">Logical Structures</a><br/> 3.1 <a href="#sec-starttags">Start-Tags, End-Tags, and Empty-Element Tags</a><br/> 3.2 <a href="#elemdecls">Element Type Declarations</a><br/> 3.2.1 <a href="#sec-element-content">Element Content</a><br/> 3.2.2 <a href="#sec-mixed-content">Mixed Content</a><br/> 3.3 <a href="#attdecls">Attribute-List Declarations</a><br/> 3.3.1 <a href="#sec-attribute-types">Attribute Types</a><br/> 3.3.2 <a href="#sec-attr-defaults">Attribute Defaults</a><br/> 3.3.3 <a href="#AVNormalize">Attribute-Value Normalization</a><br/> 3.4 <a href="#sec-condition-sect">Conditional Sections</a><br/> 4 <a href="#sec-physical-struct">Physical Structures</a><br/> 4.1 <a 
href="#sec-references">Character and Entity References</a><br/> 4.2 <a href="#sec-entity-decl">Entity Declarations</a><br/> 4.2.1 <a href="#sec-internal-ent">Internal Entities</a><br/> 4.2.2 <a href="#sec-external-ent">External Entities</a><br/> 4.3 <a href="#TextEntities">Parsed Entities</a><br/> 4.3.1 <a href="#sec-TextDecl">The Text Declaration</a><br/> 4.3.2 <a href="#wf-entities">Well-Formed Parsed Entities</a><br/> 4.3.3 <a href="#charencoding">Character Encoding in Entities</a><br/> 4.3.4 <a href="#sec-version-info">Version Information in Entities</a><br/> 4.4 <a href="#entproc">XML Processor Treatment of Entities and References</a><br/> 4.4.1 <a href="#not-recognized">Not Recognized</a><br/> 4.4.2 <a href="#included">Included</a><br/> 4.4.3 <a href="#include-if-valid">Included If Validating</a><br/> 4.4.4 <a href="#forbidden">Forbidden</a><br/> 4.4.5 <a href="#inliteral">Included in Literal</a><br/> 4.4.6 <a href="#notify">Notify</a><br/> 4.4.7 <a href="#bypass">Bypassed</a><br/> 4.4.8 <a href="#as-PE">Included as PE</a><br/> 4.4.9 <a href="#error">Error</a><br/> 4.5 <a href="#intern-replacement">Construction of Entity Replacement Text</a><br/> 4.6 <a href="#sec-predefined-ent">Predefined Entities</a><br/> 4.7 <a href="#Notations">Notation Declarations</a><br/> 4.8 <a href="#sec-doc-entity">Document Entity</a><br/> 5 <a href="#sec-conformance">Conformance</a><br/> 5.1 <a href="#proc-types">Validating and Non-Validating Processors</a><br/> 5.2 <a href="#safe-behavior">Using XML Processors</a><br/> 6 <a href="#sec-notation">Notation</a><br/> </p> <h3><a name="appendices" id="appendices"/>Appendices</h3><p class="toc">A <a href="#sec-bibliography">References</a><br/> A.1 <a href="#sec-existing-stds">Normative References</a><br/> A.2 <a href="#null">Other References</a><br/> B <a href="#sec-CharNorm">Definitions for Character Normalization</a><br/> C <a href="#sec-entexpand">Expansion of Entity and Character References</a> (Non-Normative)<br/> D <a 
href="#determinism">Deterministic Content Models</a> (Non-Normative)<br/> E <a href="#sec-guessing">Autodetection of Character Encodings</a> (Non-Normative)<br/> E.1 <a href="#sec-guessing-no-ext-info">Detection Without External Encoding Information</a><br/> E.2 <a href="#sec-guessing-with-ext-info">Priorities in the Presence of External Encoding Information</a><br/> F <a href="#sec-xml-wg">W3C XML Working Group</a> (Non-Normative)<br/> G <a href="#sec-core-wg">W3C XML Core Working Group</a> (Non-Normative)<br/> H <a href="#prod-notes">Production Notes</a> (Non-Normative)<br/> I <a href="#sec-suggested-names">Suggestions for XML Names</a> (Non-Normative)<br/> </p></div><hr/><div class="body"><div class="div1"> <h2><a name="sec-intro" id="sec-intro"/>1 Introduction</h2><p>Extensible Markup Language, abbreviated XML, describes a class of data |
79 | Working Group) also organized by the W3C. The membership of the XML Working | 79 | objects called <a title="XML Document" href="#dt-xml-doc">XML documents</a> and partially |
80 | Group is given in an appendix. Dan Connolly served as the Working Group's contact with | 80 | describes the behavior of computer programs which process them. XML is an |
81 | the W3C.</p><p>The design goals for XML are:</p><ol type="1"><li><p>XML shall be straightforwardly usable over the Internet.</p></li><li><p>XML shall support a wide variety of applications.</p></li><li><p>XML shall be compatible with SGML.</p></li><li><p>It shall be easy to write programs which process XML documents.</p></li><li><p>The number of optional features in XML is to be kept to the absolute | 81 | application profile or restricted form of SGML, the Standard Generalized Markup |
82 | minimum, ideally zero.</p></li><li><p>XML documents should be human-legible and reasonably clear.</p></li><li><p>The XML design should be prepared quickly.</p></li><li><p>The design of XML shall be formal and concise.</p></li><li><p>XML documents shall be easy to create.</p></li><li><p>Terseness in XML markup is of minimal importance.</p></li></ol><p>This specification, together with associated standards (Unicode | 82 | Language <a href="#ISO8879">[ISO 8879]</a>. By construction, XML documents are conforming |
83 | <a href="#Unicode">[Unicode]</a> and ISO/IEC 10646 <a href="#ISO10646">[ISO/IEC 10646]</a> | 83 | SGML documents.</p><p>XML documents are made up of storage units called <a title="Entity" href="#dt-entity">entities</a>, |
84 | for characters, Internet RFC 3066 <a href="#RFC1766">[IETF RFC 3066]</a> for | 84 | which contain either parsed or unparsed data. Parsed data is made up of <a title="Character" href="#dt-character">characters</a>, some of which form <a title="Character Data" href="#dt-chardata">character |
85 | language identification tags, ISO 639 <a href="#ISO639">[ISO 639]</a> | 85 | data</a>, and some of which form <a title="Markup" href="#dt-markup">markup</a>. |
86 | for language name codes, and ISO 3166 <a href="#ISO3166">[ISO 3166]</a> for | 86 | Markup encodes a description of the document's storage layout and logical |
87 | country name codes), provides all the information necessary to | 87 | structure. XML provides a mechanism to impose constraints on the storage layout |
88 | understand XML Version 1.1 and construct computer | 88 | and logical structure.</p><p> |
89 | programs to process it.</p><p>This version of the XML specification may be distributed freely, as long as | 89 | [<a name="dt-xml-proc" id="dt-xml-proc" title="XML Processor">Definition</a>: A software module called |
90 | all text and legal notices remain intact.</p></div><div class="div2"> <h3><a name="sec-terminology" id="sec-terminology" />1.2 Terminology</h3><p>The terminology used to describe XML documents is defined in the body of | 90 | an <b>XML processor</b> is used to read XML documents and provide access |
91 | this specification. <span class="mustard">The key words <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em>, <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em>, <em class="rfc2119" title="Keyword in RFC 2119 context">REQUIRED</em>, <em class="rfc2119" title="Keyword in RFC 2119 context">SHALL</em>, <em class="rfc2119" title="Keyword in RFC 2119 context">SHALL NOT</em>, <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em>, <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD NOT</em>, <em class="rfc2119" title="Keyword in RFC 2119 context">RECOMMENDED</em>, <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em>, and <em class="rfc2119" title="Keyword in RFC 2119 context">OPTIONAL</em>, when <em class="rfc2119" title="Keyword in RFC 2119 context">EMPHASIZED</em>, are to be interpreted as described in <a href="#rfc2119">[IETF RFC 2119]</a>. In addition, </span>the terms defined in the following list are used in building | 91 | to their content and structure.] |
92 | those definitions and in describing the actions of an XML processor:</p><dl><dt class="label">error</dt><dd><p>[<a name="dt-error" id="dt-error" title="Error">Definition</a>: A violation of the rules of this specification; | 92 | [<a name="dt-app" id="dt-app" title="Application">Definition</a>: It |
93 | results are undefined. <span class="mustard">Unless otherwise specified, failure to observe a prescription of this specification indicated by one of the keywords <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em>, <em class="rfc2119" title="Keyword in RFC 2119 context">REQUIRED</em>, <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em>, <em class="rfc2119" title="Keyword in RFC 2119 context">SHALL</em> and <em class="rfc2119" title="Keyword in RFC 2119 context">SHALL NOT</em> is an error.</span> Conforming software <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> detect and report an error | 93 | is assumed that an XML processor is doing its work on behalf of another module, |
94 | and <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> recover from it.]</p></dd><dt class="label">fatal error</dt><dd><p>[<a name="dt-fatal" id="dt-fatal" title="Fatal Error">Definition</a>: An error which a conforming <a title="XML Processor" href="#dt-xml-proc">XML processor</a> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> detect and report to the application. | 94 | called the <b>application</b>.] This specification describes |
95 | After encountering a fatal error, the processor <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> continue processing the | 95 | the required behavior of an XML processor in terms of how it must read XML |
96 | data to search for further errors and <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> report such errors to the application. | 96 | data and the information it must provide to the application.</p><div class="div2"> <h3><a name="sec-origin-goals" id="sec-origin-goals"/>1.1 Origin and Goals</h3><p>XML was developed by an XML Working Group (originally known as the SGML |
97 | In order to support correction of errors, the processor <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> make unprocessed | 97 | Editorial Review Board) formed under the auspices of the World Wide Web Consortium |
98 | data from the document (with intermingled character data and markup) available | 98 | (W3C) in 1996. It was chaired by Jon Bosak of Sun Microsystems with the active |
99 | to the application. Once a fatal error is detected, however, the processor | 99 | participation of an XML Special Interest Group (previously known as the SGML |
100 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> continue normal processing (i.e., it <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> continue to pass character | 100 | Working Group) also organized by the W3C. The membership of the XML Working |
101 | data and information about the document's logical structure to the application | 101 | Group is given in an appendix. Dan Connolly served as the Working Group's contact with |
102 | in the normal way).]</p></dd><dt class="label">at user option</dt><dd><p>[<a name="dt-atuseroption" id="dt-atuseroption" title="At user option">Definition</a>: Conforming software | 102 | the W3C.</p><p>The design goals for XML are:</p><ol class="enumar"><li><p>XML shall be straightforwardly usable over the Internet.</p></li><li><p>XML shall support a wide variety of applications.</p></li><li><p>XML shall be compatible with SGML.</p></li><li><p>It shall be easy to write programs which process XML documents.</p></li><li><p>The number of optional features in XML is to be kept to the absolute |
103 | <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> or <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> (depending on the modal verb in the sentence) behave as described; | 103 | minimum, ideally zero.</p></li><li><p>XML documents should be human-legible and reasonably clear.</p></li><li><p>The XML design should be prepared quickly.</p></li><li><p>The design of XML shall be formal and concise.</p></li><li><p>XML documents shall be easy to create.</p></li><li><p>Terseness in XML markup is of minimal importance.</p></li></ol><p>This specification, together with associated standards (Unicode |
104 | if it does, it <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> provide users a means to enable or disable the behavior | 104 | <a href="#Unicode">[Unicode]</a> and ISO/IEC 10646 <a href="#ISO10646">[ISO/IEC 10646]</a> |
105 | described.]</p></dd><dt class="label">validity constraint</dt><dd><p>[<a name="dt-vc" id="dt-vc" title="Validity constraint">Definition</a>: A rule which applies to | 105 | for characters, Internet RFC 3066 <a href="#RFC1766">[IETF RFC 3066]</a> for |
106 | all <a title="Validity" href="#dt-valid">valid</a> XML documents. Violations of validity | 106 | language identification tags, ISO 639 <a href="#ISO639">[ISO 639]</a> |
107 | constraints are errors; they <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em>, at user option, be reported by <a title="Validating Processor" href="#dt-validating">validating XML processors</a>.]</p></dd><dt class="label">well-formedness constraint</dt><dd><p>[<a name="dt-wfc" id="dt-wfc" title="Well-formedness constraint">Definition</a>: A rule which applies | 107 | for language name codes, and ISO 3166 <a href="#ISO3166">[ISO 3166]</a> for |
108 | to all <a title="Well-Formed" href="#dt-wellformed">well-formed</a> XML documents. Violations | 108 | country name codes), provides all the information necessary to |
109 | of well-formedness constraints are <a title="Fatal Error" href="#dt-fatal">fatal errors</a>.]</p></dd><dt class="label">match</dt><dd><p>[<a name="dt-match" id="dt-match" title="match">Definition</a>: (Of strings or names:) Two strings | 109 | understand XML Version 1.1 and construct computer |
110 | or names being compared <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be identical. Characters with multiple possible | 110 | programs to process it.</p><p>This version of the XML specification may be distributed freely, as long as |
111 | representations in Unicode (e.g. characters with both precomposed and | 111 | all text and legal notices remain intact.</p></div><div class="div2"> <h3><a name="sec-terminology" id="sec-terminology"/>1.2 Terminology</h3><p>The terminology used to describe XML documents is defined in the body of |
112 | base+diacritic forms) match only if they have the same representation in both | 112 | this specification. The key words <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em>, <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em>, <em class="rfc2119" title="Keyword in RFC 2119 context">REQUIRED</em>, <em class="rfc2119" title="Keyword in RFC 2119 context">SHALL</em>, <em class="rfc2119" title="Keyword in RFC 2119 context">SHALL NOT</em>, <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em>, <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD NOT</em>, <em class="rfc2119" title="Keyword in RFC 2119 context">RECOMMENDED</em>, <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em>, and <em class="rfc2119" title="Keyword in RFC 2119 context">OPTIONAL</em>, when <em class="rfc2119" title="Keyword in RFC 2119 context">EMPHASIZED</em>, are to be interpreted as described in <a href="#rfc2119">[IETF RFC 2119]</a>. In addition, the terms defined in the following list are used in building |
113 | strings. No | 113 | those definitions and in describing the actions of an XML processor:</p><dl><dt class="label">error</dt><dd><p> |
114 | case folding is performed. (Of strings and rules in the grammar:) A string | 114 | [<a name="dt-error" id="dt-error" title="Error">Definition</a>: A violation of the rules of this specification; |
115 | matches a grammatical production if it belongs to the language generated by | 115 | results are undefined. Unless otherwise specified, failure to observe a prescription of this specification indicated by one of the keywords <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em>, <em class="rfc2119" title="Keyword in RFC 2119 context">REQUIRED</em>, <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em>, <em class="rfc2119" title="Keyword in RFC 2119 context">SHALL</em> and <em class="rfc2119" title="Keyword in RFC 2119 context">SHALL NOT</em> is an error. Conforming software <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> detect and report an error |
116 | that production. (Of content and content models:) An element matches its declaration | 116 | and <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> recover from it.] |
117 | when it conforms in the fashion described in the constraint <b>[VC: <a href="#elementvalid">Element Valid</a>]</b>.]</p></dd><dt class="label">for compatibility</dt><dd><p>[<a name="dt-compat" id="dt-compat" title="For Compatibility">Definition</a>: Marks | 117 | </p></dd><dt class="label">fatal error</dt><dd><p> |
118 | a sentence describing a feature of XML included solely to ensure | 118 | [<a name="dt-fatal" id="dt-fatal" title="Fatal Error">Definition</a>: An error which a conforming <a title="XML Processor" href="#dt-xml-proc">XML processor</a> |
119 | that XML remains compatible with SGML.]</p></dd><dt class="label">for interoperability</dt><dd><p>[<a name="dt-interop" id="dt-interop" title="For interoperability">Definition</a>: Marks | 119 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> detect and report to the application. |
120 | a sentence describing a non-binding recommendation included to increase | 120 | After encountering a fatal error, the processor <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> continue processing the |
121 | the chances that XML documents can be processed by the existing installed | 121 | data to search for further errors and <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> report such errors to the application. |
122 | base of SGML processors which predate the WebSGML Adaptations Annex to ISO 8879.]</p></dd></dl><p></p></div><div class="div2"> <h3><a name="sec-xml11" id="sec-xml11" />1.3 Rationale and list of changes for XML 1.1</h3><p>The W3C's XML 1.0 Recommendation was first issued in 1998, and | 122 | In order to support correction of errors, the processor <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> make unprocessed |
123 | despite the issuance of many errata culminating in a Third Edition | 123 | data from the document (with intermingled character data and markup) available |
124 | of 2004, has remained (by intention) unchanged with respect to what | 124 | to the application. Once a fatal error is detected, however, the processor |
125 | is well-formed XML and what is not. This stability has been | 125 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> continue normal processing (i.e., it <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> continue to pass character |
126 | extremely useful for interoperability. However, the Unicode | 126 | data and information about the document's logical structure to the application |
127 | Standard on which XML 1.0 relies for character specifications has | 127 | in the normal way).] |
128 | not remained static, evolving from version 2.0 to version 4.0 and | 128 | </p></dd><dt class="label">at user option</dt><dd><p> |
129 | beyond. Characters not present in Unicode 2.0 may already be used | 129 | [<a name="dt-atuseroption" id="dt-atuseroption" title="At user option">Definition</a>: Conforming software |
130 | in XML 1.0 character data. However, they are not allowed in XML | 130 | <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> or <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> (depending on the modal verb in the sentence) behave as described; |
131 | names such as element type names, attribute names, enumerated | 131 | if it does, it <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> provide users a means to enable or disable the behavior |
132 | attribute values, processing instruction targets, and so on. In | 132 | described.] |
133 | addition, some characters that should have been permitted in XML | 133 | </p></dd><dt class="label">validity constraint</dt><dd><p> |
134 | names were not, due to oversights and inconsistencies in Unicode | 134 | [<a name="dt-vc" id="dt-vc" title="Validity constraint">Definition</a>: A rule which applies to |
135 | 2.0.</p><p>The overall philosophy of names has changed since XML 1.0. | 135 | all <a title="Validity" href="#dt-valid">valid</a> XML documents. Violations of validity |
136 | Whereas XML 1.0 provided a rigid definition of names, wherein | 136 | constraints are errors; they <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em>, at user option, be reported by <a title="Validating Processor" href="#dt-validating">validating XML processors</a>.] |
137 | everything that was not permitted was forbidden, XML 1.1 names are | 137 | </p></dd><dt class="label">well-formedness constraint</dt><dd><p> |
138 | designed so that everything that is not forbidden (for a specific | 138 | [<a name="dt-wfc" id="dt-wfc" title="Well-formedness constraint">Definition</a>: A rule which applies |
139 | reason) is permitted. Since Unicode will continue to grow past | 139 | to all <a title="Well-Formed" href="#dt-wellformed">well-formed</a> XML documents. Violations |
140 | version 4.0, further changes to XML can be avoided by allowing | 140 | of well-formedness constraints are <a title="Fatal Error" href="#dt-fatal">fatal errors</a>.] |
141 | almost any character, including those not yet assigned, in | 141 | </p></dd><dt class="label">match</dt><dd><p> |
142 | names.</p><p>In addition, XML 1.0 attempts to adapt to the line-end | 142 | [<a name="dt-match" id="dt-match" title="match">Definition</a>: (Of strings or names:) Two strings |
143 | conventions of various modern operating systems, but discriminates | 143 | or names being compared |
144 | against the conventions used on IBM and IBM-compatible mainframes. | 144 | <span>are</span> identical. Characters with multiple possible |
145 | As a result, XML documents on mainframes are not plain text files | 145 | representations in Unicode (e.g. characters with both precomposed and |
146 | according to the local conventions. XML 1.0 documents generated on | 146 | base+diacritic forms) match only if they have the same representation in both |
147 | mainframes must either violate the local line-end conventions, or | 147 | strings. No |
148 | employ otherwise unnecessary translation phases before parsing and | 148 | case folding is performed. (Of strings and rules in the grammar:) A string |
149 | after generation. Allowing straightforward interoperability is | 149 | matches a grammatical production if it belongs to the language generated by |
150 | particularly important when data stores are shared between | 150 | that production. (Of content and content models:) An element matches its declaration |
151 | mainframe and non-mainframe systems (as opposed to being copied | 151 | when it conforms in the fashion described in the constraint <b>[VC: <a href="#elementvalid">Element Valid</a>]</b>.] |
152 | from one to the other). Therefore XML 1.1 adds NEL (#x85) to the | 152 | </p></dd><dt class="label">for compatibility</dt><dd><p> |
153 | list of line-end characters. For completeness, the Unicode line | 153 | [<a name="dt-compat" id="dt-compat" title="For Compatibility">Definition</a>: Marks |
154 | separator character, #x2028, is also supported. | 154 | a sentence describing a feature of XML included solely to ensure |
155 | </p><p>Finally, there is considerable demand to define a standard representation | 155 | that XML remains compatible with SGML.] |
156 | of arbitrary Unicode characters in XML documents. Therefore, XML 1.1 | 156 | </p></dd><dt class="label">for interoperability</dt><dd><p> |
157 | allows the use of character references to the control characters #x1 through | 157 | [<a name="dt-interop" id="dt-interop" title="For interoperability">Definition</a>: Marks |
158 | #x1F, most of which are forbidden in XML 1.0. For reasons of robustness, | 158 | a sentence describing a non-binding recommendation included to increase |
159 | however, these characters still cannot be used directly in documents. In | 159 | the chances that XML documents can be processed by the existing installed |
160 | order to improve the robustness of character encoding detection, the additional | 160 | base of SGML processors which predate the WebSGML Adaptations Annex to ISO 8879.] |
161 | control characters #x7F through #x9F, which were freely allowed in XML 1.0 | 161 | </p></dd></dl><p> |
162 | documents, now must also appear only as character references. (Whitespace | 162 | </p></div><div class="div2"> <h3><a name="sec-xml11" id="sec-xml11"/>1.3 Rationale and list of changes for XML 1.1</h3><p>The W3C's XML 1.0 Recommendation was first issued in 1998, and |
163 | characters are of course exempt.) The minor sacrifice of backward compatibility | 163 | despite the issuance of many errata culminating in a Third Edition |
164 | is considered not significant. Due to potential problems with APIs, | 164 | of 2004, has remained (by intention) unchanged with respect to what |
165 | #x0 is still forbidden both directly and as a character reference. | 165 | is well-formed XML and what is not. This stability has been |
166 | </p><p>Finally, XML 1.1 defines a set of constraints called "full | 166 | extremely useful for interoperability. However, the Unicode |
167 | normalization" on XML documents, which document creators | 167 | Standard on which XML 1.0 relies for character specifications has |
168 | <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> adhere to, and document processors | 168 | not remained static, evolving from version 2.0 to version 4.0 and |
169 | <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> verify. Using fully normalized documents | 169 | beyond. Characters not present in Unicode 2.0 may already be used |
170 | ensures that identity comparisons of names, attribute values, and | 170 | in XML 1.0 character data. However, they are not allowed in XML |
171 | character content can be made correctly by simple binary comparison of | 171 | names such as element type names, attribute names, enumerated |
172 | Unicode strings.</p><p>A new XML version, rather than a set of errata to XML 1.0, is | 172 | attribute values, processing instruction targets, and so on. In |
173 | being created because the changes affect the definition of | 173 | addition, some characters that should have been permitted in XML |
174 | well-formed documents. XML 1.0 processors must continue to reject | 174 | names were not, due to oversights and inconsistencies in Unicode |
175 | documents that contain new characters in XML names, new line-end | 175 | 2.0.</p><p>The overall philosophy of names has changed since XML 1.0. |
176 | conventions, and references to control characters. The distinction between XML 1.0 and XML 1.1 documents | 176 | Whereas XML 1.0 provided a rigid definition of names, wherein |
177 | is indicated by the version number information in the XML | 177 | everything that was not permitted was forbidden, XML 1.1 names are |
178 | declaration at the start of each document. | 178 | designed so that everything that is not forbidden (for a specific |
179 | </p></div></div><div class="div1"> <h2><a name="sec-documents" id="sec-documents" />2 Documents</h2><p>[<a name="dt-xml-doc" id="dt-xml-doc" title="XML Document">Definition</a>: A data object is an <b>XML | 179 | reason) is permitted. Since Unicode will continue to grow past |
180 | document</b> if it is <a title="Well-Formed" href="#dt-wellformed">well-formed</a>, | 180 | version 4.0, further changes to XML can be avoided by allowing |
181 | as defined in this specification. A well-formed XML document <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> in addition | 181 | almost any character, including those not yet assigned, in |
182 | be <a title="Validity" href="#dt-valid">valid</a> if it meets certain further constraints.]</p><p>Each XML document has both a logical and a physical structure. Physically, | 182 | names.</p><p>In addition, XML 1.0 attempts to adapt to the line-end |
183 | the document is composed of units called <a title="Entity" href="#dt-entity">entities</a>. | 183 | conventions of various modern operating systems, but discriminates |
184 | An entity <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> <a title="Entity Reference" href="#dt-entref">refer</a> to other entities to | 184 | against the conventions used on IBM and IBM-compatible mainframes. |
185 | cause their inclusion in the document. A document begins in a "root" | 185 | As a result, XML documents on mainframes are not plain text files |
186 | or <a title="Document Entity" href="#dt-docent">document entity</a>. Logically, the document | 186 | according to the local conventions. XML 1.0 documents generated on |
187 | is composed of declarations, elements, comments, character references, and | 187 | mainframes must either violate the local line-end conventions, or |
188 | processing instructions, all of which are indicated in the document by explicit | 188 | employ otherwise unnecessary translation phases before parsing and |
189 | markup. The logical and physical structures <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> nest properly, as described | 189 | after generation. Allowing straightforward interoperability is |
190 | in <a href="#wf-entities"><b>4.3.2 Well-Formed Parsed Entities</b></a>.</p><div class="div2"> <h3><a name="sec-well-formed" id="sec-well-formed" />2.1 Well-Formed XML Documents</h3><p>[<a name="dt-wellformed" id="dt-wellformed" title="Well-Formed">Definition</a>: A textual object is a <b>well-formed</b> | 190 | particularly important when data stores are shared between |
191 | XML document if:]</p><ol type="1"><li><p>Taken as a whole, it matches the production labeled <a href="#NT-document">document</a>.</p></li><li><p>It meets all the well-formedness constraints given in this specification.</p></li><li><p>Each of the <a title="Text Entity" href="#dt-parsedent">parsed entities</a> | 191 | mainframe and non-mainframe systems (as opposed to being copied |
192 | which is referenced directly or indirectly within the document is <a | 192 | from one to the other). Therefore XML 1.1 adds NEL (#x85) to the |
193 | title="Well-Formed" | 193 | list of line-end characters. For completeness, the Unicode line |
194 | href="#dt-wellformed">well-formed</a>.</p></li></ol> <h5><a | 194 | separator character, #x2028, is also supported. |
195 | name="document" id="document" />Document</h5><table class="scrap" | 195 | </p><p>Finally, there is considerable demand to define a standard representation |
196 | summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-document" | 196 | of arbitrary Unicode characters in XML documents. Therefore, XML 1.1 |
197 | id="NT-document" | 197 | allows the use of character references to the control characters #x1 through |
198 | />[1] </td><td><code>document</code></td><td> ::= </td><td><code><a | 198 | #x1F, most of which are forbidden in XML 1.0. For reasons of robustness, |
199 | href="#NT-prolog">prolog</a> <a href="#NT-element">element</a> <a | 199 | however, these characters still cannot be used directly in documents. In |
200 | href="#NT-Misc">Misc</a>* - <a href="#NT-Char">Char</a>* <a | 200 | order to improve the robustness of character encoding detection, the additional |
201 | href="#NT-RestrictedChar">RestrictedChar</a> <a href="#NT-Char">Char</a>*</code></td></tr></tbody></table><p>Matching the <a href="#NT-document">document</a> production implies that:</p><ol type="1"><li><p>It contains one or more <a title="Element" href="#dt-element">elements</a>.</p></li><li><p>[<a name="dt-root" id="dt-root" title="Root Element">Definition</a>: There is exactly one element, | 201 | control characters #x7F through #x9F, which were freely allowed in XML 1.0 |
202 | called the <b>root</b>, or document element, no part of which appears | 202 | documents, now must also appear only as character references. (Whitespace |
203 | in the <a title="Content" href="#dt-content">content</a> of any other element.] For | 203 | characters are of course exempt.) The minor sacrifice of backward compatibility |
204 | all other elements, if the <a title="Start-Tag" href="#dt-stag">start-tag</a> is in | 204 | is considered not significant. Due to potential problems with APIs, |
205 | the content of another element, the <a title="End Tag" href="#dt-etag">end-tag</a> | 205 | #x0 is still forbidden both directly and as a character reference. |
206 | is in the content of the same element. More simply stated, the elements, | 206 | </p><p>Finally, XML 1.1 defines a set of constraints called "full |
207 | delimited by start- and end-tags, nest properly within each other.</p></li></ol><p>[<a name="dt-parentchild" id="dt-parentchild" title="Parent/Child">Definition</a>: As a consequence of this, | 207 | normalization" on XML documents, which document creators |
208 | for each non-root element <code>C</code> in the document, there is one other element <code>P</code> | 208 | <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> adhere to, and document processors |
209 | in the document such that <code>C</code> is in the content of <code>P</code>, but | 209 | <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> verify. Using fully normalized documents |
210 | is not in the content of any other element that is in the content of <code>P</code>. <code>P</code> | 210 | ensures that identity comparisons of names, attribute values, and |
211 | is referred to as the <b>parent</b> of <code>C</code>, and <code>C</code> as | 211 | character content can be made correctly by simple binary comparison of |
212 | a <b>child</b> of <code>P</code>.]</p></div><div class="div2"> <h3><a name="charsets" id="charsets" />2.2 Characters</h3><p>[<a name="dt-text" id="dt-text" title="Text">Definition</a>: A parsed entity contains <b>text</b>, | 212 | Unicode strings.</p><p>A new XML version, rather than a set of errata to XML 1.0, is |
213 | a sequence of <a title="Character" href="#dt-character">characters</a>, which may | 213 | being created because the changes affect the definition of |
214 | represent markup or character data.] [<a name="dt-character" id="dt-character" title="Character">Definition</a>: A <b>character</b> | 214 | well-formed documents. XML 1.0 processors must continue to reject |
215 | is an atomic unit of text as specified by <span>ISO/IEC 10646 <a href="#ISO10646">[ISO/IEC 10646]</a></span>. Legal characters are tab, carriage | 215 | documents that contain new characters in XML names, new line-end |
216 | return, line feed, and the legal characters | 216 | conventions, and references to control characters. The distinction between XML 1.0 and XML 1.1 documents |
217 | of Unicode and ISO/IEC 10646. The | 217 | is indicated by the version number information in the XML |
218 | versions of these standards cited in <a href="#sec-existing-stds"><b>A.1 Normative References</b></a> were | 218 | declaration at the start of each document. |
219 | current at the time this document was prepared. New characters may be added | 219 | </p></div></div><div class="div1"> <h2><a name="sec-documents" id="sec-documents"/>2 Documents</h2><p> |
220 | to these standards by amendments or new editions. Consequently, XML processors | 220 | [<a name="dt-xml-doc" id="dt-xml-doc" title="XML Document">Definition</a>: A data object is an <b>XML |
221 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> accept any character in the range specified for <a href="#NT-Char">Char</a>.]</p> <h5><a name="char32" id="char32" />Character Range</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-Char" id="NT-Char" />[2] </td><td><code>Char</code></td><td> ::= </td><td><code>[#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]</code></td><td><i>/* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. */</i></td></tr><tr valign="baseline"><td><a name="NT-RestrictedChar" id="NT-RestrictedChar" />[2a] </td><td><code>RestrictedChar</code></td><td> ::= </td><td><code>[#x1-#x8] | [#xB-#xC] | [#xE-#x1F] | [#x7F-#x84] | [#x86-#x9F]</code></td></tr></tbody></table><p>The mechanism for encoding character code points into bit patterns <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> | 221 | document</b> if it is <a title="Well-Formed" href="#dt-wellformed">well-formed</a>, |
222 | vary from entity to entity. All XML processors <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> accept the UTF-8 and UTF-16 | 222 | as defined in this specification. <span>In addition, the XML document is</span> |
223 | encodings of <span> Unicode | 223 | <a title="Validity" href="#dt-valid">valid</a> if it meets certain further constraints.] |
224 | <a href="#Unicode">[Unicode]</a></span>; | 224 | </p><p>Each XML document has both a logical and a physical structure. Physically, |
225 | the mechanisms for signaling which of the two is in use, | 225 | the document is composed of units called <a title="Entity" href="#dt-entity">entities</a>. |
226 | or for bringing other encodings into play, are discussed later, in <a href="#charencoding"><b>4.3.3 Character Encoding in Entities</b></a>.</p><div class="note"><p class="prefix"><b>Note:</b></p><p>Document authors are encouraged to avoid | 226 | An entity <span>may</span> <a title="Entity Reference" href="#dt-entref">refer</a> to other entities to |
227 | "compatibility characters", as defined | 227 | cause their inclusion in the document. A document begins in a "root" |
228 | in Unicode <a href="#Unicode">[Unicode]</a>. | 228 | or <a title="Document Entity" href="#dt-docent">document entity</a>. Logically, the document |
229 | The characters defined in the following ranges are also | 229 | is composed of declarations, elements, comments, character references, and |
230 | discouraged. They are either control characters or permanently undefined Unicode | 230 | processing instructions, all of which are indicated in the document by explicit |
231 | characters:</p><div class="exampleInner"><pre> | 231 | markup. The logical and physical structures <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> nest properly, as described |
232 | [#x7F-#x84], [#x86-#x9F], [#xFDD0-#xFDDF], | 232 | in <a href="#wf-entities"><b>4.3.2 Well-Formed Parsed Entities</b></a>.</p><div class="div2"> <h3><a name="sec-well-formed" id="sec-well-formed"/>2.1 Well-Formed XML Documents</h3><p> |
233 | [#x1FFFE-#x1FFFF], [#x2FFFE-#x2FFFF], [#x3FFFE-#x3FFFF], | 233 | [<a name="dt-wellformed" id="dt-wellformed" title="Well-Formed">Definition</a>: A textual object is a <b>well-formed</b> |
234 | [#x4FFFE-#x4FFFF], [#x5FFFE-#x5FFFF], [#x6FFFE-#x6FFFF], | 234 | XML document if:] |
235 | [#x7FFFE-#x7FFFF], [#x8FFFE-#x8FFFF], [#x9FFFE-#x9FFFF], | 235 | </p><ol class="enumar"><li><p>Taken as a whole, it matches the production labeled <a href="#NT-document">document</a>.</p></li><li><p>It meets all the well-formedness constraints given in this specification.</p></li><li><p>Each of the <a title="Text Entity" href="#dt-parsedent">parsed entities</a> |
236 | [#xAFFFE-#xAFFFF], [#xBFFFE-#xBFFFF], [#xCFFFE-#xCFFFF], | 236 | which is referenced directly or indirectly within the document is <a title="Well-Formed" href="#dt-wellformed">well-formed</a>.</p></li></ol> <h5><a name="document" id="document"/>Document</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-document" id="NT-document"/>[1] </td><td><code>document</code></td><td> ::= </td><td><code> |
237 | [#xDFFFE-#xDFFFF], [#xEFFFE-#xEFFFF], [#xFFFFE-#xFFFFF], | 237 | <span> |
238 | [#x10FFFE-#x10FFFF].</pre></div></div></div><div class="div2"> <h3><a name="sec-common-syn" id="sec-common-syn" />2.3 Common Syntactic Constructs</h3><p>This section defines some symbols used widely in the grammar.</p><p><a href="#NT-S">S</a> (white space) consists of one or more space (#x20) | 238 | (</span> |
239 | characters, carriage returns, line feeds, or tabs.</p> <h5><a name="white" id="white" />White Space</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-S" id="NT-S" />[3] </td><td><code>S</code></td><td> ::= </td><td><code>(#x20 | #x9 | #xD | #xA)+</code></td></tr></tbody></table><div class="note"><p class="prefix"><b>Note:</b></p><p>The presence of #xD in the above production is | 239 | <a href="#NT-prolog">prolog</a> |
240 | maintained purely for backward compatibility with the | 240 | <a href="#NT-element">element</a> |
241 | <a href="http://www.w3.org/TR/1998/REC-xml-19980210">First Edition</a>. | 241 | <a href="#NT-Misc">Misc</a>*<span> |
242 | As explained in <a href="#sec-line-ends"><b>2.11 End-of-Line Handling</b></a>, | 242 | )</span> |
243 | all #xD characters literally present in an XML document | 243 | <span> |
244 | are either removed or replaced by #xA characters before | 244 | |
245 | any other processing is done. The only way to get a #xD character to match this production is to | 245 | </span>- <span> |
246 | use a character reference in an entity value literal.</p></div><p>[<a name="dt-name" id="dt-name" title="Name">Definition</a>: A <b>Name</b> is a token beginning | 246 | (</span> |
247 | with a letter or one of a few punctuation characters, and continuing with | 247 | <a href="#NT-Char">Char</a>* <a href="#NT-RestrictedChar">RestrictedChar</a> |
248 | letters, digits, hyphens, underscores, colons, or full stops, together known | 248 | <span> |
249 | as name characters.] Names beginning with the string "<code>xml</code>", | 249 | |
250 | or <span>with</span> any string which would match <code>(('X'|'x') ('M'|'m') ('L'|'l'))</code>, | 250 | </span> |
251 | are reserved for standardization in this or future versions of this specification.</p><div class="note"><p class="prefix"><b>Note:</b></p><p>The | 251 | <a href="#NT-Char">Char</a>*<span> |
252 | Namespaces in XML Recommendation <a href="#xml-names">[XML Names]</a> assigns a meaning | 252 | )</span> |
253 | to names containing colon characters. Therefore, authors should not use the | 253 | </code></td></tr></tbody></table><p>Matching the <a href="#NT-document">document</a> production implies that:</p><ol class="enumar"><li><p>It contains one or more <a title="Element" href="#dt-element">elements</a>.</p></li><li><p> |
254 | colon in XML names except for namespace purposes, but XML processors must | 254 | [<a name="dt-root" id="dt-root" title="Root Element">Definition</a>: There is exactly one element, |
255 | accept the colon as a name character.</p></div><p>An <a href="#NT-Nmtoken">Nmtoken</a> (name token) is any mixture of name | 255 | called the <b>root</b>, or document element, no part of which appears |
256 | characters.</p><p>The first character of a Name <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be a NameStartChar, and any | 256 | in the <a title="Content" href="#dt-content">content</a> of any other element.] For |
257 | other characters <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be NameChars; this mechanism is used to | 257 | all other elements, if the <a title="Start-Tag" href="#dt-stag">start-tag</a> is in |
258 | prevent names from beginning with European (ASCII) digits or with | 258 | the content of another element, the <a title="End Tag" href="#dt-etag">end-tag</a> |
259 | basic combining characters. Almost all characters are permitted in | 259 | is in the content of the same element. More simply stated, the elements, |
260 | names, except those which either are or reasonably could be used as | 260 | delimited by start- and end-tags, nest properly within each other.</p></li></ol><p> |
261 | delimiters. The intention is to be inclusive rather than exclusive, | 261 | [<a name="dt-parentchild" id="dt-parentchild" title="Parent/Child">Definition</a>: As a consequence of this, |
262 | so that writing systems not yet encoded in Unicode can be used in | 262 | for each non-root element <code>C</code> in the document, there is one other element <code>P</code> |
263 | XML names. See <a href="#sec-suggested-names"><b>I Suggestions for XML Names</b></a> for suggestions on the creation of | 263 | in the document such that <code>C</code> is in the content of <code>P</code>, but |
264 | names.</p><p>Document authors are encouraged to use names which are | 264 | is not in the content of any other element that is in the content of <code>P</code>. <code>P</code> |
265 | meaningful words or combinations of words in natural languages, and | 265 | is referred to as the <b>parent</b> of <code>C</code>, and <code>C</code> as |
266 | to avoid symbolic or white space characters in names. Note that | 266 | a <b>child</b> of <code>P</code>.] |
267 | COLON, HYPHEN-MINUS, FULL STOP (period), LOW LINE (underscore), and | 267 | </p></div><div class="div2"> <h3><a name="charsets" id="charsets"/>2.2 Characters</h3><p> |
268 | MIDDLE DOT are explicitly permitted.</p><p>The ASCII symbols and punctuation marks, along with a fairly | 268 | [<a name="dt-text" id="dt-text" title="Text">Definition</a>: A parsed entity contains <b>text</b>, |
269 | large group of Unicode symbol characters, are excluded from names | 269 | a sequence of <a title="Character" href="#dt-character">characters</a>, which may |
270 | because they are more useful as delimiters in contexts where XML | 270 | represent markup or character data.] |
271 | names are used outside XML documents; providing this group gives | 271 | [<a name="dt-character" id="dt-character" title="Character">Definition</a>: A <b>character</b> |
272 | those contexts hard guarantees about what <em>cannot</em> be part of | 272 | is an atomic unit of text as specified by ISO/IEC 10646 <a href="#ISO10646">[ISO/IEC 10646]</a>. Legal characters are tab, carriage |
273 | an XML name. The character #x037E, GREEK QUESTION MARK, is excluded | 273 | return, line feed, and the legal characters |
274 | because when normalized it becomes a semicolon, which could change | 274 | of Unicode and ISO/IEC 10646. The |
275 | the meaning of entity references.</p> <h5><a name="IDABN1S" id="IDABN1S" />Names and Tokens</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-NameStartChar" id="NT-NameStartChar" />[4] </td><td><code>NameStartChar</code></td><td> ::= </td><td><code>":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]</code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-NameChar" id="NT-NameChar" />[4a] </td><td><code>NameChar</code></td><td> ::= </td><td><code><a href="#NT-NameStartChar">NameStartChar</a> | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]</code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-Name" id="NT-Name" />[5] </td><td><code>Name</code></td><td> ::= </td><td><code><a href="#NT-NameStartChar">NameStartChar</a> (<a href="#NT-NameChar">NameChar</a>)*</code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-Names" id="NT-Names" />[6] </td><td><code>Names</code></td><td> ::= </td><td><code><a href="#NT-Name">Name</a> (#x20 <a href="#NT-Name">Name</a>)*</code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-Nmtoken" id="NT-Nmtoken" />[7] </td><td><code>Nmtoken</code></td><td> ::= </td><td><code>(<a href="#NT-NameChar">NameChar</a>)+</code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-Nmtokens" id="NT-Nmtokens" />[8] </td><td><code>Nmtokens</code></td><td> ::= </td><td><code><a href="#NT-Nmtoken">Nmtoken</a> (#x20 <a href="#NT-Nmtoken">Nmtoken</a>)*</code></td></tr></tbody></table><div class="note"><p class="prefix"><b>Note:</b></p><p>The <a href="#NT-Names">Names</a> | 275 | versions of these standards cited in <a href="#sec-existing-stds"><b>A.1 Normative References</b></a> were |
276 | and <a href="#NT-Nmtokens">Nmtokens</a> productions are used to define the validity | 276 | current at the time this document was prepared. New characters may be added |
277 | of tokenized attribute values after normalization (see <a href="#sec-attribute-types"><b>3.3.1 Attribute Types</b></a>).</p></div><p>Literal data is any quoted string not containing the quotation mark used | 277 | to these standards by amendments or new editions. Consequently, XML processors |
278 | as a delimiter for that string. Literals are used for specifying the content | 278 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> accept any character in the range specified for <a href="#NT-Char">Char</a>.] |
279 | of internal entities (<a href="#NT-EntityValue">EntityValue</a>), the values | 279 | </p> <h5><a name="char32" id="char32"/>Character Range</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-Char" id="NT-Char"/>[2] </td><td><code>Char</code></td><td> ::= </td><td><code>[#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]</code></td><td><i>/* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. */</i></td></tr><tr valign="baseline"><td><a name="NT-RestrictedChar" id="NT-RestrictedChar"/>[2a] </td><td><code>RestrictedChar</code></td><td> ::= </td><td><code>[#x1-#x8] | [#xB-#xC] | [#xE-#x1F] | [#x7F-#x84] | [#x86-#x9F]</code></td></tr></tbody></table><p>The mechanism for encoding character code points into bit patterns <span>may</span> |
280 | of attributes (<a href="#NT-AttValue">AttValue</a>), and external identifiers | 280 | vary from entity to entity. All XML processors <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> accept the UTF-8 and UTF-16 |
281 | (<a href="#NT-SystemLiteral">SystemLiteral</a>). Note that a <a href="#NT-SystemLiteral">SystemLiteral</a> | 281 | encodings of Unicode <a href="#Unicode">[Unicode]</a>; |
282 | can be parsed without scanning for markup.</p> <h5><a name="IDAFR1S" id="IDAFR1S" />Literals</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-EntityValue" id="NT-EntityValue" />[9] </td><td><code>EntityValue</code></td><td> ::= </td><td><code>'"' ([^%&"] | <a href="#NT-PEReference">PEReference</a> | 282 | the mechanisms for signaling which of the two is in use, |
283 | | <a href="#NT-Reference">Reference</a>)* '"' </code></td></tr><tr valign="baseline"><td /><td /><td /><td><code>| "'" ([^%&'] | <a href="#NT-PEReference">PEReference</a> | <a href="#NT-Reference">Reference</a>)* "'"</code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-AttValue" id="NT-AttValue" />[10] </td><td><code>AttValue</code></td><td> ::= </td><td><code>'"' ([^<&"] | <a href="#NT-Reference">Reference</a>)* | 283 | or for bringing other encodings into play, are discussed later, in <a href="#charencoding"><b>4.3.3 Character Encoding in Entities</b></a>.</p><div class="note"><p class="prefix"><b>Note:</b></p><p>Document authors are encouraged to avoid |
284 | '"' </code></td></tr><tr valign="baseline"><td /><td /><td /><td><code>| "'" ([^<&'] | <a href="#NT-Reference">Reference</a>)* | 284 | "compatibility characters", as defined |
285 | "'"</code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-SystemLiteral" id="NT-SystemLiteral" />[11] </td><td><code>SystemLiteral</code></td><td> ::= </td><td><code>('"' [^"]* '"') | ("'" [^']* "'") </code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-PubidLiteral" id="NT-PubidLiteral" />[12] </td><td><code>PubidLiteral</code></td><td> ::= </td><td><code>'"' <a href="#NT-PubidChar">PubidChar</a>* '"' | 285 | in Unicode <a href="#Unicode">[Unicode]</a>. |
286 | | "'" (<a href="#NT-PubidChar">PubidChar</a> - "'")* "'"</code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-PubidChar" id="NT-PubidChar" />[13] </td><td><code>PubidChar</code></td><td> ::= </td><td><code>#x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%]</code></td></tr></tbody></table><div class="note"><p class="prefix"><b>Note:</b></p><p>Although | 286 | The characters defined in the following ranges are also |
287 | the <a href="#NT-EntityValue">EntityValue</a> production allows the definition | 287 | discouraged. They are either control characters or permanently undefined Unicode |
288 | of a general entity consisting of a single explicit <code><</code> in the literal | 288 | characters:</p><div class="exampleInner"><pre><span>[#x1-#x8], [#xB-#xC], [#xE-#x1F], </span>[#x7F-#x84], [#x86-#x9F], [#xFDD0-#xFDDF], |
289 | (e.g., <code><!ENTITY mylt "<"></code>), it is strongly advised to avoid | 289 | [#<span>x</span>1FFFE-#x1FFFF], [#<span>x</span>2FFFE-#x2FFFF], [#<span>x</span>3FFFE-#x3FFFF], |
290 | this practice since any reference to that entity will cause a well-formedness | 290 | [#<span>x</span>4FFFE-#x4FFFF], [#<span>x</span>5FFFE-#x5FFFF], [#<span>x</span>6FFFE-#x6FFFF], |
291 | error.</p></div></div><div class="div2"> <h3><a name="syntax" id="syntax" />2.4 Character Data and Markup</h3><p><a title="Text" href="#dt-text">Text</a> consists of intermingled <a title="Character Data" href="#dt-chardata">character data</a> and markup. [<a name="dt-markup" id="dt-markup" title="Markup">Definition</a>: <b>Markup</b> takes the form of <a title="Start-Tag" href="#dt-stag">start-tags</a>, <a title="End Tag" href="#dt-etag">end-tags</a>, <a title="Empty" href="#dt-empty">empty-element tags</a>, <a title="Entity Reference" href="#dt-entref">entity references</a>, <a title="Character Reference" href="#dt-charref">character | 291 | [#<span>x</span>7FFFE-#x7FFFF], [#<span>x</span>8FFFE-#x8FFFF], [#<span>x</span>9FFFE-#x9FFFF], |
292 | references</a>, <a title="Comment" href="#dt-comment">comments</a>, <a title="CDATA Section" href="#dt-cdsection">CDATA section</a> delimiters, <a title="Document Type Declaration" href="#dt-doctype">document | 292 | [#<span>x</span>AFFFE-#xAFFFF], [#<span>x</span>BFFFE-#xBFFFF], [#<span>x</span>CFFFE-#xCFFFF], |
293 | type declarations</a>, <a title="Processing instruction" href="#dt-pi">processing instructions</a>, <a href="#NT-XMLDecl">XML declarations</a>, <a href="#NT-TextDecl">text declarations</a>, | 293 | [#<span>x</span>DFFFE-#xDFFFF], [#<span>x</span>EFFFE-#xEFFFF], [#<span>x</span>FFFFE-#xFFFFF], |
294 | and any white space that is at the top level of the document entity (that | 294 | [#<span>x</span>10FFFE-#x10FFFF].</pre></div></div></div><div class="div2"> <h3><a name="sec-common-syn" id="sec-common-syn"/>2.3 Common Syntactic Constructs</h3><p>This section defines some symbols used widely in the grammar.</p><p> |
295 | is, outside the document element and not inside any other markup).]</p><p>[<a name="dt-chardata" id="dt-chardata" title="Character Data">Definition</a>: All text that is not markup | 295 | <a href="#NT-S">S</a> (white space) consists of one or more space (#x20) |
296 | constitutes the <b>character data</b> of the document.]</p><p>The ampersand character (&) and the left angle bracket (<) <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em></span> appear | 296 | characters, carriage returns, line feeds, or tabs.</p> <h5><a name="white" id="white"/>White Space</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-S" id="NT-S"/>[3] </td><td><code>S</code></td><td> ::= </td><td><code>(#x20 | #x9 | #xD | #xA)+</code></td></tr></tbody></table><div class="note"><p class="prefix"><b>Note:</b></p><p>The presence of #xD in the above production is |
297 | in their literal form<span class="mustard">, except</span> when used as markup delimiters, or | 297 | maintained purely for backward compatibility with the |
298 | within a <a title="Comment" href="#dt-comment">comment</a>, a <a title="Processing instruction" href="#dt-pi">processing | 298 | <a href="http://www.w3.org/TR/1998/REC-xml-19980210">First Edition</a>. |
299 | instruction</a>, or a <a title="CDATA Section" href="#dt-cdsection">CDATA section</a>. | 299 | As explained in <a href="#sec-line-ends"><b>2.11 End-of-Line Handling</b></a>, |
300 | 300 | all #xD characters literally present in an XML document | |
301 | If they are needed elsewhere, they <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be <a title="escape" href="#dt-escape">escaped</a> | 301 | are either removed or replaced by #xA characters before |
302 | using either <a title="Character Reference" href="#dt-charref">numeric character references</a> | 302 | any other processing is done. The only way to get a #xD character to match this production is to |
303 | or the strings "<code>&amp;</code>" and "<code>&lt;</code>" | 303 | use a character reference in an entity value literal.</p></div><p> |
304 | respectively. The right angle bracket (>) <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> be represented using the string "<code>&gt;</code>", | 304 | [<a name="dt-name" id="dt-name" title="Name">Definition</a>: A <b>Name</b> is a token beginning |
305 | and <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em>, <a title="For Compatibility" href="#dt-compat">for compatibility</a>, be escaped | 305 | with a letter or one of a few punctuation characters, and continuing with |
306 | using <span>either</span> "<code>&gt;</code>" or a character reference when it | 306 | letters, digits, hyphens, underscores, colons, or full stops, together known |
307 | appears in the string "<code>]]></code>" in content, when | 307 | as name characters.] Names beginning with the string "<code>xml</code>", |
308 | that string is not marking the end of a <a title="CDATA Section" href="#dt-cdsection">CDATA | 308 | or with any string which would match <code>(('X'|'x') ('M'|'m') ('L'|'l'))</code>, |
309 | section</a>.</p><p>In the content of elements, character data is any string of characters | 309 | are reserved for standardization in this or future versions of this specification.</p><div class="note"><p class="prefix"><b>Note:</b></p><p>The |
310 | which does not contain the start-delimiter of any markup or the | 310 | Namespaces in XML Recommendation <a href="#xml-names">[XML Names]</a> assigns a meaning |
311 | CDATA-section-close delimiter, | 311 | to names containing colon characters. Therefore, authors should not use the |
312 | "<code>]]></code>". | 312 | colon in XML names except for namespace purposes, but XML processors must |
313 | In a CDATA section, | 313 | accept the colon as a name character.</p></div><p>An <a href="#NT-Nmtoken">Nmtoken</a> (name token) is any mixture of name |
314 | character data is any string of characters not including the CDATA-section-close | 314 | characters.</p><p>The first character of a Name <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be a NameStartChar, and any |
315 | delimiter.</p><p>To allow attribute values to contain both single and double quotes, the | 315 | other characters <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be NameChars; this mechanism is used to |
316 | apostrophe or single-quote character (') <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> be represented as "<code>&apos;</code>", | 316 | prevent names from beginning with European (ASCII) digits or with |
317 | and the double-quote character (") as "<code>&quot;</code>".</p> <h5><a name="IDASZ1S" id="IDASZ1S" />Character Data</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-CharData" id="NT-CharData" />[14] </td><td><code>CharData</code></td><td> ::= </td><td><code>[^<&]* - ([^<&]* ']]>' [^<&]*)</code></td></tr></tbody></table></div><div class="div2"> <h3><a name="sec-comments" id="sec-comments" />2.5 Comments</h3><p>[<a name="dt-comment" id="dt-comment" title="Comment">Definition</a>: <b>Comments</b> <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> appear | 317 | basic combining characters. Almost all characters are permitted in |
318 | anywhere in a document outside other <a title="Markup" href="#dt-markup">markup</a>; | 318 | names, except those which either are or reasonably could be used as |
319 | in addition, they <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> appear within the document type declaration at places | 319 | delimiters. The intention is to be inclusive rather than exclusive, |
320 | allowed by the grammar. They are not part of the document's <a title="Character Data" href="#dt-chardata">character | 320 | so that writing systems not yet encoded in Unicode can be used in |
321 | data</a>; an XML processor <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em>, but need not, make it possible for an | 321 | XML names. See <a href="#sec-suggested-names"><b>I Suggestions for XML Names</b></a> for suggestions on the creation of |
322 | application to retrieve the text of comments. <a title="For Compatibility" href="#dt-compat">For | 322 | names.</p><p>Document authors are encouraged to use names which are |
323 | compatibility</a>, the string "<code>--</code>" (double-hyphen) | 323 | meaningful words or combinations of words in natural languages, and |
324 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> occur within comments.] Parameter | 324 | to avoid symbolic or white space characters in names. Note that |
325 | entity references <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> be</span> recognized within comments.</p> <h5><a name="IDAL11S" id="IDAL11S" />Comments</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-Comment" id="NT-Comment" />[15] </td><td><code>Comment</code></td><td> ::= </td><td><code>'<!--' ((<a href="#NT-Char">Char</a> - '-') | ('-' | 325 | COLON, HYPHEN-MINUS, FULL STOP (period), LOW LINE (underscore), and |
326 | (<a href="#NT-Char">Char</a> - '-')))* '-->'</code></td></tr></tbody></table><p>An example of a comment:</p><div class="exampleInner"><pre><!-- declarations for <head> & <body> --></pre></div><p>Note | 326 | MIDDLE DOT are explicitly permitted.</p><p>The ASCII symbols and punctuation marks, along with a fairly |
327 | that the grammar does not allow a comment ending in <code>---></code>. The | 327 | large group of Unicode symbol characters, are excluded from names |
328 | following example is <em>not</em> well-formed.</p><div class="exampleInner"><pre><!-- B+, B, or B---></pre></div></div><div class="div2"> <h3><a name="sec-pi" id="sec-pi" />2.6 Processing Instructions</h3><p>[<a name="dt-pi" id="dt-pi" title="Processing instruction">Definition</a>: <b>Processing instructions</b> | 328 | because they are more useful as delimiters in contexts where XML |
329 | (PIs) allow documents to contain instructions for applications.]</p> <h5><a name="IDAD31S" id="IDAD31S" />Processing Instructions</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-PI" id="NT-PI" />[16] </td><td><code>PI</code></td><td> ::= </td><td><code>'<?' <a href="#NT-PITarget">PITarget</a> (<a href="#NT-S">S</a> | 329 | names are used outside XML documents; providing this group gives |
330 | (<a href="#NT-Char">Char</a>* - (<a href="#NT-Char">Char</a>* '?>' <a href="#NT-Char">Char</a>*)))? '?>'</code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-PITarget" id="NT-PITarget" />[17] </td><td><code>PITarget</code></td><td> ::= </td><td><code><a href="#NT-Name">Name</a> - (('X' | 'x') ('M' | | 330 | those contexts hard guarantees about what <em>cannot</em> be part of |
331 | 'm') ('L' | 'l'))</code></td></tr></tbody></table><p>PIs are not part of the document's <a title="Character Data" href="#dt-chardata">character | 331 | an XML name. The character #x037E, GREEK QUESTION MARK, is excluded |
332 | data</a>, but <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be passed through to the application. The PI begins | 332 | because when normalized it becomes a semicolon, which could change |
333 | with a target (<a href="#NT-PITarget">PITarget</a>) used to identify the application | 333 | the meaning of entity references.</p> <h5><a name="IDAKUDS" id="IDAKUDS"/>Names and Tokens</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-NameStartChar" id="NT-NameStartChar"/>[4] </td><td><code>NameStartChar</code></td><td> ::= </td><td><code>":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]</code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-NameChar" id="NT-NameChar"/>[4a] </td><td><code>NameChar</code></td><td> ::= </td><td><code> |
334 | to which the instruction is directed. The target names "<code>XML</code>", "<code>xml</code>", | 334 | <a href="#NT-NameStartChar">NameStartChar</a> | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]</code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-Name" id="NT-Name"/>[5] </td><td><code>Name</code></td><td> ::= </td><td><code> |
335 | and so on are reserved for standardization in this or future versions of this | 335 | <a href="#NT-NameStartChar">NameStartChar</a> (<a href="#NT-NameChar">NameChar</a>)*</code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-Names" id="NT-Names"/>[6] </td><td><code>Names</code></td><td> ::= </td><td><code> |
336 | specification. The XML <a title="Notation" href="#dt-notation">Notation</a> mechanism | 336 | <a href="#NT-Name">Name</a> (#x20 <a href="#NT-Name">Name</a>)*</code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-Nmtoken" id="NT-Nmtoken"/>[7] </td><td><code>Nmtoken</code></td><td> ::= </td><td><code>(<a href="#NT-NameChar">NameChar</a>)+</code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-Nmtokens" id="NT-Nmtokens"/>[8] </td><td><code>Nmtokens</code></td><td> ::= </td><td><code> |
337 | <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> be used for formal declaration of PI targets. Parameter | 337 | <a href="#NT-Nmtoken">Nmtoken</a> (#x20 <a href="#NT-Nmtoken">Nmtoken</a>)*</code></td></tr></tbody></table><div class="note"><p class="prefix"><b>Note:</b></p><p>The <a href="#NT-Names">Names</a> |
338 | entity references <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> be</span> recognized within processing instructions.</p></div><div class="div2"> <h3><a name="sec-cdata-sect" id="sec-cdata-sect" />2.7 CDATA Sections</h3><p>[<a name="dt-cdsection" id="dt-cdsection" title="CDATA Section">Definition</a>: <b>CDATA sections</b> <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> occur anywhere character data may occur; they are used to escape blocks | 338 | and <a href="#NT-Nmtokens">Nmtokens</a> productions are used to define the validity |
339 | of text containing characters which would otherwise be recognized as markup. | 339 | of tokenized attribute values after normalization (see <a href="#sec-attribute-types"><b>3.3.1 Attribute Types</b></a>).</p></div><p>Literal data is any quoted string not containing the quotation mark used |
340 | CDATA sections begin with the string "<code><![CDATA[</code>" | 340 | as a delimiter for that string. Literals are used for specifying the content |
341 | and end with the string "<code>]]></code>":]</p> <h5><a name="IDAOA2S" id="IDAOA2S" />CDATA Sections</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-CDSect" id="NT-CDSect" />[18] </td><td><code>CDSect</code></td><td> ::= </td><td><code><a href="#NT-CDStart">CDStart</a> <a href="#NT-CData">CData</a> <a href="#NT-CDEnd">CDEnd</a></code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-CDStart" id="NT-CDStart" />[19] </td><td><code>CDStart</code></td><td> ::= </td><td><code>'<![CDATA['</code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-CData" id="NT-CData" />[20] </td><td><code>CData</code></td><td> ::= </td><td><code>(<a href="#NT-Char">Char</a>* - (<a href="#NT-Char">Char</a>* | 341 | of internal entities (<a href="#NT-EntityValue">EntityValue</a>), the values |
342 | ']]>' <a href="#NT-Char">Char</a>*)) </code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-CDEnd" id="NT-CDEnd" />[21] </td><td><code>CDEnd</code></td><td> ::= </td><td><code>']]>'</code></td></tr></tbody></table><p>Within a CDATA section, only the <a href="#NT-CDEnd">CDEnd</a> string is | 342 | of attributes (<a href="#NT-AttValue">AttValue</a>), and external identifiers |
343 | recognized as markup, so that left angle brackets and ampersands may occur | 343 | (<a href="#NT-SystemLiteral">SystemLiteral</a>). Note that a <a href="#NT-SystemLiteral">SystemLiteral</a> |
344 | in their literal form; they need not (and cannot) be escaped using "<code>&lt;</code>" | 344 | can be parsed without scanning for markup.</p> <h5><a name="IDAKYDS" id="IDAKYDS"/>Literals</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-EntityValue" id="NT-EntityValue"/>[9] </td><td><code>EntityValue</code></td><td> ::= </td><td><code>'"' ([^%&"] | <a href="#NT-PEReference">PEReference</a> |
345 | and "<code>&amp;</code>". CDATA sections cannot nest.</p><p>An example of a CDATA section, in which "<code><greeting></code>" | 345 | | <a href="#NT-Reference">Reference</a>)* '"' </code></td></tr><tr valign="baseline"><td/><td/><td/><td><code>| "'" ([^%&'] | <a href="#NT-PEReference">PEReference</a> | <a href="#NT-Reference">Reference</a>)* "'"</code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-AttValue" id="NT-AttValue"/>[10] </td><td><code>AttValue</code></td><td> ::= </td><td><code>'"' ([^<&"] | <a href="#NT-Reference">Reference</a>)* |
346 | and "<code></greeting></code>" are recognized as <a title="Character Data" href="#dt-chardata">character data</a>, not <a title="Markup" href="#dt-markup">markup</a>:</p><div class="exampleInner"><pre><![CDATA[<greeting>Hello, world!</greeting>]]> </pre></div></div><div class="div2"> <h3><a name="sec-prolog-dtd" id="sec-prolog-dtd" />2.8 Prolog and Document Type Declaration</h3><p>[<a name="dt-xmldecl" id="dt-xmldecl" title="XML Declaration">Definition</a>: XML 1.1 documents <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> | 346 | '"' </code></td></tr><tr valign="baseline"><td/><td/><td/><td><code>| "'" ([^<&'] | <a href="#NT-Reference">Reference</a>)* |
347 | begin with an <b>XML declaration</b> which specifies the version of | 347 | "'"</code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-SystemLiteral" id="NT-SystemLiteral"/>[11] </td><td><code>SystemLiteral</code></td><td> ::= </td><td><code>('"' [^"]* '"') | ("'" [^']* "'") </code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-PubidLiteral" id="NT-PubidLiteral"/>[12] </td><td><code>PubidLiteral</code></td><td> ::= </td><td><code>'"' <a href="#NT-PubidChar">PubidChar</a>* '"' |
348 | XML being used.] For example, the following is a complete XML 1.1 document, <a title="Well-Formed" href="#dt-wellformed">well-formed</a> but not <a title="Validity" href="#dt-valid">valid</a>:</p><div class="exampleInner"><pre><?xml version="1.1"?> | 348 | | "'" (<a href="#NT-PubidChar">PubidChar</a> - "'")* "'"</code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-PubidChar" id="NT-PubidChar"/>[13] </td><td><code>PubidChar</code></td><td> ::= </td><td><code>#x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%]</code></td></tr></tbody></table><div class="note"><p class="prefix"><b>Note:</b></p><p>Although |
349 | <greeting>Hello, world!</greeting> </pre></div><p>but the following is an XML 1.0 document because it | 349 | the <a href="#NT-EntityValue">EntityValue</a> production allows the definition |
350 | does not have an XML declaration:</p><div class="exampleInner"><pre><greeting>Hello, world!</greeting></pre></div><p>The function of the markup in an XML document is to describe its storage and | 350 | of a general entity consisting of a single explicit <code><</code> in the literal |
351 | logical structure and to associate <span>attribute | 351 | (e.g., <code><!ENTITY mylt "<"></code>), it is strongly advised to avoid |
352 | name-value</span> pairs with its logical structures. XML provides a mechanism, the | 352 | this practice since any reference to that entity will cause a well-formedness |
353 | <a title="Document Type Declaration" href="#dt-doctype">document | 353 | error.</p></div></div><div class="div2"> <h3><a name="syntax" id="syntax"/>2.4 Character Data and Markup</h3><p> |
354 | type declaration</a>, to define constraints on the logical structure | 354 | <a title="Text" href="#dt-text">Text</a> consists of intermingled <a title="Character Data" href="#dt-chardata">character data</a> and markup. [<a name="dt-markup" id="dt-markup" title="Markup">Definition</a>: |
355 | and to support the use of predefined storage units. [<a name="dt-valid" id="dt-valid" title="Validity">Definition</a>: An XML document is <b>valid</b> if it has an associated | 355 | <b>Markup</b> takes the form of <a title="Start-Tag" href="#dt-stag">start-tags</a>, <a title="End Tag" href="#dt-etag">end-tags</a>, <a title="Empty" href="#dt-empty">empty-element tags</a>, <a title="Entity Reference" href="#dt-entref">entity references</a>, <a title="Character Reference" href="#dt-charref">character |
356 | document type declaration and if the document complies with the constraints | 356 | references</a>, <a title="Comment" href="#dt-comment">comments</a>, <a title="CDATA Section" href="#dt-cdsection">CDATA section</a> delimiters, <a title="Document Type Declaration" href="#dt-doctype">document |
357 | expressed in it.]</p><p>The document type declaration <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> appear before the first <a title="Element" href="#dt-element">element</a> | 357 | type declarations</a>, <a title="Processing instruction" href="#dt-pi">processing instructions</a>, <a href="#NT-XMLDecl">XML declarations</a>, <a href="#NT-TextDecl">text declarations</a>, |
358 | in the document.</p> <h5><a name="xmldoc" id="xmldoc" />Prolog</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-prolog" id="NT-prolog" />[22] </td><td><code>prolog</code></td><td> ::= </td><td><code><a href="#NT-XMLDecl">XMLDecl</a> <a href="#NT-Misc">Misc</a>* | 358 | and any white space that is at the top level of the document entity (that |
359 | (<a href="#NT-doctypedecl">doctypedecl</a> <a href="#NT-Misc">Misc</a>*)?</code></td></tr><tr valign="baseline"><td><a name="NT-XMLDecl" id="NT-XMLDecl" />[23] </td><td><code>XMLDecl</code></td><td> ::= </td><td><code>'<?xml' <a href="#NT-VersionInfo">VersionInfo</a> <a href="#NT-EncodingDecl">EncodingDecl</a>? <a href="#NT-SDDecl">SDDecl</a>? <a href="#NT-S">S</a>?'?>'</code></td></tr><tr valign="baseline"><td><a name="NT-VersionInfo" id="NT-VersionInfo" />[24] </td><td><code>VersionInfo</code></td><td> ::= </td><td><code><a href="#NT-S">S</a> 'version' <a href="#NT-Eq">Eq</a> | 359 | is, outside the document element and not inside any other markup).] |
360 | ("'" <a href="#NT-VersionNum">VersionNum</a> "'" | '"' <a href="#NT-VersionNum">VersionNum</a> | 360 | </p><p> |
361 | '"')</code></td></tr><tr valign="baseline"><td><a name="NT-Eq" id="NT-Eq" />[25] </td><td><code>Eq</code></td><td> ::= </td><td><code><a href="#NT-S">S</a>? '=' <a href="#NT-S">S</a>?</code></td></tr><tr valign="baseline"><td><a name="NT-VersionNum" id="NT-VersionNum" />[26] </td><td><code>VersionNum</code></td><td> ::= </td><td><code>'1.1'</code></td></tr><tr valign="baseline"><td><a name="NT-Misc" id="NT-Misc" />[27] </td><td><code>Misc</code></td><td> ::= </td><td><code><a href="#NT-Comment">Comment</a> | <a href="#NT-PI">PI</a> | 361 | [<a name="dt-chardata" id="dt-chardata" title="Character Data">Definition</a>: All text that is not markup |
362 | | <a href="#NT-S">S</a></code></td></tr></tbody></table><p>[<a name="dt-doctype" id="dt-doctype" title="Document Type Declaration">Definition</a>: The XML <b>document | 362 | constitutes the <b>character data</b> of the document.] |
363 | type declaration</b> contains or points to <a title="markup declaration" href="#dt-markupdecl">markup | 363 | </p><p>The ampersand character (&) and the left angle bracket (<) <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> appear |
364 | declarations</a> that provide a grammar for a class of documents. This | 364 | in their literal form, except when used as markup delimiters, or |
365 | grammar is known as a document type definition, or <b>DTD</b>. The document | 365 | within a <a title="Comment" href="#dt-comment">comment</a>, a <a title="Processing instruction" href="#dt-pi">processing |
366 | type declaration can point to an external subset (a special kind of <a title="External Entity" href="#dt-extent">external entity</a>) containing markup declarations, | 366 | instruction</a>, or a <a title="CDATA Section" href="#dt-cdsection">CDATA section</a>. |
367 | or can contain the markup declarations directly in an internal subset, or | 367 | |
368 | can do both. The DTD for a document consists of both subsets taken together.]</p><p>[<a name="dt-markupdecl" id="dt-markupdecl" title="markup declaration">Definition</a>: A <b>markup declaration</b> | 368 | If they are needed elsewhere, they <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be <a title="escape" href="#dt-escape">escaped</a> |
369 | is an <a title="Element Type declaration" href="#dt-eldecl">element type declaration</a>, an <a title="Attribute-List Declaration" href="#dt-attdecl">attribute-list declaration</a>, an <a title="entity declaration" href="#dt-entdecl">entity | 369 | using either <a title="Character Reference" href="#dt-charref">numeric character references</a> |
370 | declaration</a>, or a <a title="Notation Declaration" href="#dt-notdecl">notation declaration</a>.] | 370 | or the strings "<code>&amp;</code>" and "<code>&lt;</code>" |
371 | These declarations <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> be contained in whole or in part within <a title="Parameter entity" href="#dt-PE">parameter | 371 | respectively. The right angle bracket (>) <span>may</span> be represented using the string "<code>&gt;</code>", |
372 | entities</a>, as described in the well-formedness and validity constraints | 372 | and <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em>, <a title="For Compatibility" href="#dt-compat">for compatibility</a>, be escaped |
373 | below. For further | 373 | using either "<code>&gt;</code>" or a character reference when it |
374 | information, see <a href="#sec-physical-struct"><b>4 Physical Structures</b></a>.</p> <h5><a name="dtd" id="dtd" />Document Type Definition</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-doctypedecl" id="NT-doctypedecl" />[28] </td><td><code>doctypedecl</code></td><td> ::= </td><td><code>'<!DOCTYPE' <a href="#NT-S">S</a> <a href="#NT-Name">Name</a> | 374 | appears in the string "<code>]]></code>" in content, when |
375 | (<a href="#NT-S">S</a> <a href="#NT-ExternalID">ExternalID</a>)? <a href="#NT-S">S</a>? | 375 | that string is not marking the end of a <a title="CDATA Section" href="#dt-cdsection">CDATA |
376 | ('[' <a href="#NT-intSubset">intSubset</a> ']' <a href="#NT-S">S</a>?)? '>'</code></td><td><a href="#vc-roottype">[VC: Root Element Type]</a></td></tr><tr valign="baseline"><td /><td /><td /><td /><td><a href="#ExtSubset">[WFC: External Subset]</a></td></tr><tr valign="baseline"><td><a name="NT-DeclSep" id="NT-DeclSep" />[28a] </td><td><code>DeclSep</code></td><td> ::= </td><td><code><a href="#NT-PEReference">PEReference</a> | <a href="#NT-S">S</a></code></td><td><a href="#PE-between-Decls">[WFC: PE Between Declarations]</a></td></tr><tr valign="baseline"><td><a name="NT-intSubset" id="NT-intSubset" />[28b] </td><td><code>intSubset</code></td><td> ::= </td><td><code>(<a href="#NT-markupdecl">markupdecl</a> | <a href="#NT-DeclSep">DeclSep</a>)*</code></td></tr><tr valign="baseline"><td><a name="NT-markupdecl" id="NT-markupdecl" />[29] </td><td><code>markupdecl</code></td><td> ::= </td><td><code><a href="#NT-elementdecl">elementdecl</a> | <a href="#NT-AttlistDecl">AttlistDecl</a> | <a href="#NT-EntityDecl">EntityDecl</a> | 376 | section</a>.</p><p>In the content of elements, character data is any string of characters |
377 | | <a href="#NT-NotationDecl">NotationDecl</a> | <a href="#NT-PI">PI</a> | <a href="#NT-Comment">Comment</a></code></td><td><a href="#vc-PEinMarkupDecl">[VC: Proper Declaration/PE Nesting]</a></td></tr><tr valign="baseline"><td /><td /><td /><td /><td><a href="#wfc-PEinInternalSubset">[WFC: PEs in Internal Subset]</a></td></tr></tbody></table><p>Note | 377 | which does not contain the start-delimiter of any markup or the |
378 | that it is possible to construct a well-formed document containing a <a href="#NT-doctypedecl">doctypedecl</a> | 378 | CDATA-section-close delimiter, |
379 | that neither points to an external subset nor contains an internal subset.</p><p>The markup declarations <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> be made up in whole or in part of the <a title="Replacement Text" href="#dt-repltext">replacement text</a> of <a title="Parameter entity" href="#dt-PE">parameter | 379 | "<code>]]></code>". |
380 | entities</a>. The productions later in this specification for individual | 380 | In a CDATA section, |
381 | nonterminals (<a href="#NT-elementdecl">elementdecl</a>, <a href="#NT-AttlistDecl">AttlistDecl</a>, | 381 | character data is any string of characters not including the CDATA-section-close |
382 | and so on) describe the declarations <em>after</em> all the parameter | 382 | delimiter.</p><p>To allow attribute values to contain both single and double quotes, the |
383 | entities have been <a title="Include" href="#dt-include">included</a>.</p><p>Parameter | 383 | apostrophe or single-quote character (') <span>may</span> be represented as "<code>&apos;</code>", |
384 | entity references are recognized anywhere in the DTD (internal and external | 384 | and the double-quote character (") as "<code>&quot;</code>".</p> <h5><a name="IDAABES" id="IDAABES"/>Character Data</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-CharData" id="NT-CharData"/>[14] </td><td><code>CharData</code></td><td> ::= </td><td><code>[^<&]* - ([^<&]* ']]>' [^<&]*)</code></td></tr></tbody></table></div><div class="div2"> <h3><a name="sec-comments" id="sec-comments"/>2.5 Comments</h3><p> |
385 | subsets and external parameter entities), except in literals, processing instructions, | 385 | [<a name="dt-comment" id="dt-comment" title="Comment">Definition</a>: |
386 | comments, and the contents of ignored conditional sections (see <a href="#sec-condition-sect"><b>3.4 Conditional Sections</b></a>). | 386 | <b>Comments</b> <span>may</span> appear |
387 | They are also recognized in entity value literals. The use of parameter entities | 387 | anywhere in a document outside other <a title="Markup" href="#dt-markup">markup</a>; |
388 | in the internal subset is restricted as described below.</p><div class="constraint"><p class="prefix"><a name="vc-roottype" id="vc-roottype" /><b>Validity constraint: Root Element Type</b></p><p>The <a href="#NT-Name">Name</a> | 388 | in addition, they <span>may</span> appear within the document type declaration at places |
389 | in the document type declaration <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match the element type of the <a title="Root Element" href="#dt-root">root element</a>.</p></div><div class="constraint"><p class="prefix"><a name="vc-PEinMarkupDecl" id="vc-PEinMarkupDecl" /><b>Validity constraint: Proper Declaration/PE Nesting</b></p><p>Parameter-entity <a title="Replacement Text" href="#dt-repltext">replacement text</a> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be properly nested with markup declarations. That is to say, if either | 389 | allowed by the grammar. They are not part of the document's <a title="Character Data" href="#dt-chardata">character |
390 | the first character or the last character of a markup declaration (<a href="#NT-markupdecl">markupdecl</a> | 390 | data</a>; an XML processor <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em>, but need not, make it possible for an |
391 | above) is contained in the replacement text for a <a title="Parameter-entity reference" href="#dt-PERef">parameter-entity | 391 | application to retrieve the text of comments. <a title="For Compatibility" href="#dt-compat">For |
392 | reference</a>, both <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be contained in the same replacement text.</p></div><div class="constraint"><p class="prefix"><a name="wfc-PEinInternalSubset" id="wfc-PEinInternalSubset" /><b>Well-formedness constraint: PEs in Internal Subset</b></p><p>In | 392 | compatibility</a>, the string "<code>--</code>" (double-hyphen) |
393 | the internal DTD subset, <a title="Parameter-entity reference" href="#dt-PERef">parameter-entity references</a> <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> occur within markup declarations; they <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> occur where markup declarations can occur</span>. | 393 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> occur within comments.] Parameter |
394 | (This does not apply to references that occur in external parameter entities | 394 | entity references <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> be recognized within comments.</p> <h5><a name="IDA5CES" id="IDA5CES"/>Comments</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-Comment" id="NT-Comment"/>[15] </td><td><code>Comment</code></td><td> ::= </td><td><code>'<!--' ((<a href="#NT-Char">Char</a> - '-') | ('-' |
395 | or to the external subset.)</p></div><div class="constraint"><p class="prefix"><a name="ExtSubset" id="ExtSubset" /><b>Well-formedness constraint: External Subset</b></p><p>The external subset, if any, <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match the production for <a href="#NT-extSubset">extSubset</a>.</p></div><div class="constraint"><p class="prefix"><a name="PE-between-Decls" id="PE-between-Decls" /><b>Well-formedness constraint: PE Between Declarations</b></p><p>The replacement text of a parameter entity reference | 395 | (<a href="#NT-Char">Char</a> - '-')))* '-->'</code></td></tr></tbody></table><p>An example of a comment:</p><div class="exampleInner"><pre><!-- declarations for <head> & <body> --></pre></div><p>Note |
396 | in a <a href="#NT-DeclSep">DeclSep</a> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match the production <a href="#NT-extSubsetDecl">extSubsetDecl</a>.</p></div><p>Like the internal subset, the external subset and any external parameter | 396 | that the grammar does not allow a comment ending in <code>---></code>. The |
397 | entities referenced | 397 | following example is <em>not</em> well-formed.</p><div class="exampleInner"><pre><!-- B+, B, or B---></pre></div></div><div class="div2"> <h3><a name="sec-pi" id="sec-pi"/>2.6 Processing Instructions</h3><p> |
398 | in a <a href="#NT-DeclSep">DeclSep</a> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> consist of a series of | 398 | [<a name="dt-pi" id="dt-pi" title="Processing instruction">Definition</a>: |
399 | complete markup declarations of the types allowed by the non-terminal symbol <a href="#NT-markupdecl">markupdecl</a>, interspersed with white space or <a title="Parameter-entity reference" href="#dt-PERef">parameter-entity references</a>. However, portions of | 399 | <b>Processing instructions</b> |
400 | the contents of the external subset or of these | 400 | (PIs) allow documents to contain instructions for applications.] |
401 | external parameter entities <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> conditionally be ignored by using the <a title="conditional section" href="#dt-cond-section">conditional section</a> construct; this is not | 401 | </p> <h5><a name="IDAREES" id="IDAREES"/>Processing Instructions</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-PI" id="NT-PI"/>[16] </td><td><code>PI</code></td><td> ::= </td><td><code>'<?' <a href="#NT-PITarget">PITarget</a> (<a href="#NT-S">S</a> |
402 | allowed in the internal subset<span> but is | 402 | (<a href="#NT-Char">Char</a>* - (<a href="#NT-Char">Char</a>* '?>' <a href="#NT-Char">Char</a>*)))? '?>'</code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-PITarget" id="NT-PITarget"/>[17] </td><td><code>PITarget</code></td><td> ::= </td><td><code> |
403 | allowed in external parameter entities referenced in the internal subset</span>.</p> <h5><a name="ext-Subset" id="ext-Subset" />External Subset</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-extSubset" id="NT-extSubset" />[30] </td><td><code>extSubset</code></td><td> ::= </td><td><code><a href="#NT-TextDecl">TextDecl</a>? <a href="#NT-extSubsetDecl">extSubsetDecl</a></code></td></tr><tr valign="baseline"><td><a name="NT-extSubsetDecl" id="NT-extSubsetDecl" />[31] </td><td><code>extSubsetDecl</code></td><td> ::= </td><td><code>( <a href="#NT-markupdecl">markupdecl</a> | <a href="#NT-conditionalSect">conditionalSect</a> | <a href="#NT-DeclSep">DeclSep</a>)*</code></td></tr></tbody></table><p>The external subset and external parameter entities also differ from the | 403 | <a href="#NT-Name">Name</a> - (('X' | 'x') ('M' | |
404 | internal subset in that in them, <a title="Parameter-entity reference" href="#dt-PERef">parameter-entity | 404 | 'm') ('L' | 'l'))</code></td></tr></tbody></table><p>PIs are not part of the document's <a title="Character Data" href="#dt-chardata">character |
405 | references</a> are permitted <em>within</em> markup declarations, | 405 | data</a>, but <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be passed through to the application. The PI begins |
406 | not only <em>between</em> markup declarations.</p><p>An example of an XML document with a document type declaration:</p><div class="exampleInner"><pre><?xml version="1.1"?> | 406 | with a target (<a href="#NT-PITarget">PITarget</a>) used to identify the application |
407 | <!DOCTYPE greeting SYSTEM "hello.dtd"> | 407 | to which the instruction is directed. The target names "<code>XML</code>", "<code>xml</code>", |
408 | <greeting>Hello, world!</greeting> </pre></div><p>The <a title="System Identifier" href="#dt-sysid">system identifier</a> "<code>hello.dtd</code>" | 408 | and so on are reserved for standardization in this or future versions of this |
409 | gives the address (a URI reference) of a DTD for the document.</p><p>The declarations can also be given locally, as in this example:</p><div class="exampleInner"><pre><?xml version="1.1" encoding="UTF-8" ?> | 409 | specification. The XML <a title="Notation" href="#dt-notation">Notation</a> mechanism |
410 | <!DOCTYPE greeting [ | 410 | <span>may</span> be used for formal declaration of PI targets. Parameter |
411 | <!ELEMENT greeting (#PCDATA)> | 411 | entity references <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> be recognized within processing instructions.</p></div><div class="div2"> <h3><a name="sec-cdata-sect" id="sec-cdata-sect"/>2.7 CDATA Sections</h3><p> |
412 | ]> | 412 | [<a name="dt-cdsection" id="dt-cdsection" title="CDATA Section">Definition</a>: |
413 | <greeting>Hello, world!</greeting></pre></div><p>If both the external and internal subsets are used, the internal subset | 413 | <b>CDATA sections</b> <span>may</span> occur anywhere character data may occur; they are used to escape blocks |
414 | <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be</span> considered to occur before the external subset. | 414 | of text containing characters which would otherwise be recognized as markup. |
415 | This has the effect that entity and attribute-list declarations in the internal | 415 | CDATA sections begin with the string "<code><![CDATA[</code>" |
416 | subset take precedence over those in the external subset.</p><p>XML 1.1 processors <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> accept XML 1.0 | 416 | and end with the string "<code>]]></code>":] |
417 | documents as well. If a document is well-formed or valid XML 1.0, and provided it | 417 | </p> <h5><a name="IDA2HES" id="IDA2HES"/>CDATA Sections</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-CDSect" id="NT-CDSect"/>[18] </td><td><code>CDSect</code></td><td> ::= </td><td><code> |
418 | does not contain any control characters | 418 | <a href="#NT-CDStart">CDStart</a> |
419 | in the range [#x7F-#x9F] other than as character escapes, it may be | 419 | <a href="#NT-CData">CData</a> |
420 | made well-formed or valid XML 1.1 respectively simply by changing the | 420 | <a href="#NT-CDEnd">CDEnd</a> |
421 | version number.</p></div><div class="div2"> <h3><a name="sec-rmd" id="sec-rmd" />2.9 Standalone Document Declaration</h3><p>Markup declarations can affect the content of the document, as passed from | 421 | </code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-CDStart" id="NT-CDStart"/>[19] </td><td><code>CDStart</code></td><td> ::= </td><td><code>'<![CDATA['</code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-CData" id="NT-CData"/>[20] </td><td><code>CData</code></td><td> ::= </td><td><code>(<a href="#NT-Char">Char</a>* - (<a href="#NT-Char">Char</a>* |
422 | an <a title="XML Processor" href="#dt-xml-proc">XML processor</a> to an application; examples | 422 | ']]>' <a href="#NT-Char">Char</a>*)) </code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-CDEnd" id="NT-CDEnd"/>[21] </td><td><code>CDEnd</code></td><td> ::= </td><td><code>']]>'</code></td></tr></tbody></table><p>Within a CDATA section, only the <a href="#NT-CDEnd">CDEnd</a> string is |
423 | are attribute defaults and entity declarations. The standalone document declaration, | 423 | recognized as markup, so that left angle brackets and ampersands may occur |
424 | which <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> appear as a component of the XML declaration, signals whether or | 424 | in their literal form; they need not (and cannot) be escaped using "<code>&lt;</code>" |
425 | not there are such declarations which appear external to the <a title="Document Entity" href="#dt-docent">document | 425 | and "<code>&amp;</code>". CDATA sections cannot nest.</p><p>An example of a CDATA section, in which "<code><greeting></code>" |
426 | entity</a> | 426 | and "<code></greeting></code>" are recognized as <a title="Character Data" href="#dt-chardata">character data</a>, not <a title="Markup" href="#dt-markup">markup</a>:</p><div class="exampleInner"><pre><![CDATA[<greeting>Hello, world!</greeting>]]> </pre></div></div><div class="div2"> <h3><a name="sec-prolog-dtd" id="sec-prolog-dtd"/>2.8 Prolog and Document Type Declaration</h3><p> |
427 | or in parameter entities. [<a name="dt-extmkpdecl" id="dt-extmkpdecl" title="External Markup Declaration">Definition</a>: An <b>external | 427 | [<a name="dt-xmldecl" id="dt-xmldecl" title="XML Declaration">Definition</a>: XML 1.1 documents <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> |
428 | markup declaration</b> is defined as a markup declaration occurring in | 428 | begin with an <b>XML declaration</b> which specifies the version of |
429 | the external subset or in a parameter entity (external or internal, the latter | 429 | XML being used.] For example, the following is a complete XML 1.1 document, <a title="Well-Formed" href="#dt-wellformed">well-formed</a> but not <a title="Validity" href="#dt-valid">valid</a>:</p><div class="exampleInner"><pre><?xml version="1.1"?> |
430 | being included because non-validating processors are not required to read | 430 | <greeting>Hello, world!</greeting> </pre></div><p>but the following is an XML 1.0 document because it |
431 | them).]</p> <h5><a name="fulldtd" id="fulldtd" />Standalone Document Declaration</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-SDDecl" id="NT-SDDecl" />[32] </td><td><code>SDDecl</code></td><td> ::= </td><td><code>#x20+ 'standalone' <a href="#NT-Eq">Eq</a> | 431 | does not have an XML declaration:</p><div class="exampleInner"><pre><greeting>Hello, world!</greeting></pre></div><p>The function of the markup in an XML document is to describe its storage and |
432 | (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"')) </code></td><td><a href="#vc-check-rmd">[VC: Standalone Document Declaration]</a></td></tr></tbody></table><p>In a standalone document declaration, the value "yes" indicates | 432 | logical structure and to associate attribute |
433 | that there are no <a title="External Markup Declaration" href="#dt-extmkpdecl">external markup declarations</a> which | 433 | name-value pairs with its logical structures. XML provides a mechanism, the |
434 | affect the information passed from the XML processor to the application. The | 434 | <a title="Document Type Declaration" href="#dt-doctype">document |
435 | value "no" indicates that there are or may be such external | 435 | type declaration</a>, to define constraints on the logical structure |
436 | markup declarations. Note that the standalone document declaration only denotes | 436 | and to support the use of predefined storage units. [<a name="dt-valid" id="dt-valid" title="Validity">Definition</a>: An XML document is <b>valid</b> if it has an associated |
437 | the presence of external <em>declarations</em>; the presence, in a document, | 437 | document type declaration and if the document complies with the constraints |
438 | of references to external <em>entities</em>, when those entities are internally | 438 | expressed in it.] |
439 | declared, does not change its standalone status.</p><p>If there are no external markup declarations, the standalone document declaration | 439 | </p><p>The document type declaration <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> appear before the first <a title="Element" href="#dt-element">element</a> |
440 | has no meaning. If there are external markup declarations but there is no | 440 | in the document.</p> <h5><a name="xmldoc" id="xmldoc"/>Prolog</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-prolog" id="NT-prolog"/>[22] </td><td><code>prolog</code></td><td> ::= </td><td><code> |
441 | standalone document declaration, the value "no" is assumed.</p><p>Any XML document for which <code>standalone="no"</code> holds can be converted | 441 | <a href="#NT-XMLDecl">XMLDecl</a> |
442 | algorithmically to a standalone document, which may be desirable for some | 442 | <a href="#NT-Misc">Misc</a>* |
443 | network delivery applications.</p><div class="constraint"><p class="prefix"><a name="vc-check-rmd" id="vc-check-rmd" /><b>Validity constraint: Standalone Document Declaration</b></p><p>The | 443 | (<a href="#NT-doctypedecl">doctypedecl</a> |
444 | standalone document declaration <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> have the value "no" if | 444 | <a href="#NT-Misc">Misc</a>*)?</code></td></tr><tr valign="baseline"><td><a name="NT-XMLDecl" id="NT-XMLDecl"/>[23] </td><td><code>XMLDecl</code></td><td> ::= </td><td><code>'<?xml' <a href="#NT-VersionInfo">VersionInfo</a> |
445 | any external markup declarations contain declarations of:</p><ul><li><p>attributes with <a title="Attribute Default" href="#dt-default">default</a> values, | 445 | <a href="#NT-EncodingDecl">EncodingDecl</a>? <a href="#NT-SDDecl">SDDecl</a>? <a href="#NT-S">S</a>? '?>'</code></td></tr><tr valign="baseline"><td><a name="NT-VersionInfo" id="NT-VersionInfo"/>[24] </td><td><code>VersionInfo</code></td><td> ::= </td><td><code> |
446 | if elements to which these attributes apply appear in the document without | 446 | <a href="#NT-S">S</a> 'version' <a href="#NT-Eq">Eq</a> |
447 | specifications of values for these attributes, or</p></li><li><p>entities (other than <code>amp</code>, | 447 | ("'" <a href="#NT-VersionNum">VersionNum</a> "'" | '"' <a href="#NT-VersionNum">VersionNum</a> |
448 | <code>lt</code>, | 448 | '"')</code></td></tr><tr valign="baseline"><td><a name="NT-Eq" id="NT-Eq"/>[25] </td><td><code>Eq</code></td><td> ::= </td><td><code> |
449 | <code>gt</code>, | 449 | <a href="#NT-S">S</a>? '=' <a href="#NT-S">S</a>?</code></td></tr><tr valign="baseline"><td><a name="NT-VersionNum" id="NT-VersionNum"/>[26] </td><td><code>VersionNum</code></td><td> ::= </td><td><code>'1.1'</code></td></tr><tr valign="baseline"><td><a name="NT-Misc" id="NT-Misc"/>[27] </td><td><code>Misc</code></td><td> ::= </td><td><code> |
450 | <code>apos</code>, | 450 | <a href="#NT-Comment">Comment</a> | <a href="#NT-PI">PI</a> |
451 | <code>quot</code>), if <a title="Entity Reference" href="#dt-entref">references</a> | 451 | | <a href="#NT-S">S</a> |
452 | to those entities appear in the document, or</p></li><li><p>attributes with | 452 | </code></td></tr></tbody></table><p> |
453 | tokenized types, where the | 453 | [<a name="dt-doctype" id="dt-doctype" title="Document Type Declaration">Definition</a>: The XML <b>document |
454 | attribute appears in the document with a value such that | 454 | type declaration</b> contains or points to <a title="markup declaration" href="#dt-markupdecl">markup |
455 | <a href="#AVNormalize"><cite>normalization</cite></a> | 455 | declarations</a> that provide a grammar for a class of documents. This |
456 | will produce a different value from that which would be produced | 456 | grammar is known as a document type definition, or <b>DTD</b>. The document |
457 | in the absence of the declaration, or</p></li><li><p>element types with <a title="Element content" href="#dt-elemcontent">element content</a>, | 457 | type declaration can point to an external subset (a special kind of <a title="External Entity" href="#dt-extent">external entity</a>) containing markup declarations, |
458 | if white space occurs directly within any instance of those types.</p></li></ul></div><p>An example XML declaration with a standalone document declaration:</p><div class="exampleInner"><pre><?xml version="1.1" standalone='yes'?></pre></div></div><div class="div2"> <h3><a name="sec-white-space" id="sec-white-space" />2.10 White Space Handling</h3><p>In editing XML documents, it is often convenient to use "white space" | 458 | or can contain the markup declarations directly in an internal subset, or |
459 | (spaces, tabs, and blank lines) | 459 | can do both. The DTD for a document consists of both subsets taken together.] |
460 | to set apart the markup for greater readability. Such white space is typically | 460 | </p><p> |
461 | not intended for inclusion in the delivered version of the document. On the | 461 | [<a name="dt-markupdecl" id="dt-markupdecl" title="markup declaration">Definition</a>: A <b>markup declaration</b> |
462 | other hand, "significant" white space that should be preserved | 462 | is an <a title="Element Type declaration" href="#dt-eldecl">element type declaration</a>, an <a title="Attribute-List Declaration" href="#dt-attdecl">attribute-list declaration</a>, an <a title="entity declaration" href="#dt-entdecl">entity |
463 | in the delivered version is common, for example in poetry and source code.</p><p>An <a title="XML Processor" href="#dt-xml-proc">XML processor</a> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> always pass | 463 | declaration</a>, or a <a title="Notation Declaration" href="#dt-notdecl">notation declaration</a>.] |
464 | all characters in a document that are not markup through to the application. | 464 | These declarations <span>may</span> be contained in whole or in part within <a title="Parameter entity" href="#dt-PE">parameter |
465 | A <a title="Validating Processor" href="#dt-validating"> validating XML processor</a> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> also | 465 | entities</a>, as described in the well-formedness and validity constraints |
466 | inform the application which of these characters constitute white space appearing | 466 | below. For further |
467 | in <a title="Element content" href="#dt-elemcontent">element content</a>.</p><p>A special <a title="Attribute" href="#dt-attr">attribute</a> named <code>xml:space</code> <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> be attached to an element to signal an intention that in that element, | 467 | information, see <a href="#sec-physical-struct"><b>4 Physical Structures</b></a>.</p> <h5><a name="dtd" id="dtd"/>Document Type Definition</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-doctypedecl" id="NT-doctypedecl"/>[28] </td><td><code>doctypedecl</code></td><td> ::= </td><td><code>'<!DOCTYPE' <a href="#NT-S">S</a> |
468 | white space should be preserved by applications. In valid documents, this | 468 | <a href="#NT-Name">Name</a> |
469 | attribute, like any other, <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be <a title="Attribute-List Declaration" href="#dt-attdecl">declared</a> | 469 | (<a href="#NT-S">S</a> |
470 | if it is used. When declared, it <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be given as an <a title="Enumerated Attribute
Values" href="#dt-enumerated">enumerated | 470 | <a href="#NT-ExternalID">ExternalID</a>)? <a href="#NT-S">S</a>? |
471 | type</a> whose values | 471 | ('[' <a href="#NT-intSubset">intSubset</a> ']' <a href="#NT-S">S</a>?)? '>'</code></td><td><a href="#vc-roottype">[VC: Root Element Type]</a></td></tr><tr valign="baseline"><td/><td/><td/><td/><td><a href="#ExtSubset">[WFC: External Subset]</a></td></tr><tr valign="baseline"><td><a name="NT-DeclSep" id="NT-DeclSep"/>[28a] </td><td><code>DeclSep</code></td><td> ::= </td><td><code> |
472 | are one or both of "default" and "preserve". | 472 | <a href="#NT-PEReference">PEReference</a> | <a href="#NT-S">S</a> |
473 | For example:</p><div class="exampleInner"><pre><!ATTLIST poem xml:space (default|preserve) 'preserve'> | 473 | </code></td><td><a href="#PE-between-Decls">[WFC: PE Between Declarations]</a></td></tr><tr valign="baseline"><td><a name="NT-intSubset" id="NT-intSubset"/>[28b] </td><td><code>intSubset</code></td><td> ::= </td><td><code>(<a href="#NT-markupdecl">markupdecl</a> | <a href="#NT-DeclSep">DeclSep</a>)*</code></td></tr><tr valign="baseline"><td><a name="NT-markupdecl" id="NT-markupdecl"/>[29] </td><td><code>markupdecl</code></td><td> ::= </td><td><code> |
474 | <!ATTLIST pre xml:space (preserve) #FIXED 'preserve'></pre></div><p>The value "default" signals that applications' default white-space | 474 | <a href="#NT-elementdecl">elementdecl</a> | <a href="#NT-AttlistDecl">AttlistDecl</a> | <a href="#NT-EntityDecl">EntityDecl</a> |
475 | processing modes are acceptable for this element; the value "preserve" | 475 | | <a href="#NT-NotationDecl">NotationDecl</a> | <a href="#NT-PI">PI</a> | <a href="#NT-Comment">Comment</a> |
476 | indicates the intent that applications preserve all the white space. This | 476 | </code></td><td><a href="#vc-PEinMarkupDecl">[VC: Proper Declaration/PE Nesting]</a></td></tr><tr valign="baseline"><td/><td/><td/><td/><td><a href="#wfc-PEinInternalSubset">[WFC: PEs in Internal Subset]</a></td></tr></tbody></table><p>Note |
477 | declared intent is considered to apply to all elements within the content | 477 | that it is possible to construct a well-formed document containing a <a href="#NT-doctypedecl">doctypedecl</a> |
478 | of the element where it is specified, unless <span>overridden</span> with | 478 | that neither points to an external subset nor contains an internal subset.</p><p>The markup declarations <span>may</span> be made up in whole or in part of the <a title="Replacement Text" href="#dt-repltext">replacement text</a> of <a title="Parameter entity" href="#dt-PE">parameter |
479 | another instance of the <code>xml:space</code> attribute. <span>This specification does not give meaning to any value of <code>xml:space</code> other than "default" and "preserve". It is an error for other values to be specified; the XML processor <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> report the error or <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> recover by ignoring the attribute specification or by reporting the (erroneous) value to the application. Applications may ignore or reject erroneous values.</span></p><p>The <a title="Root Element" href="#dt-root">root element</a> of any document is considered | 479 | entities</a>. The productions later in this specification for individual |
480 | to have signaled no intentions as regards application space handling, unless | 480 | nonterminals (<a href="#NT-elementdecl">elementdecl</a>, <a href="#NT-AttlistDecl">AttlistDecl</a>, |
481 | it provides a value for this attribute or the attribute is declared with a | 481 | and so on) describe the declarations <em>after</em> all the parameter |
482 | default value.</p></div><div class="div2"> <h3><a name="sec-line-ends" id="sec-line-ends" />2.11 End-of-Line Handling</h3><p>XML <a title="Text Entity" href="#dt-parsedent">parsed entities</a> are often stored | 482 | entities have been <a title="Include" href="#dt-include">included</a>.</p><p>Parameter |
483 | in computer files which, for editing convenience, are organized into lines. | 483 | entity references are recognized anywhere in the DTD (internal and external |
484 | These lines are typically separated by some combination of the characters | 484 | subsets and external parameter entities), except in literals, processing instructions, |
485 | CARRIAGE RETURN (#xD) and LINE FEED (#xA).</p><p>To | 485 | comments, and the contents of ignored conditional sections (see <a href="#sec-condition-sect"><b>3.4 Conditional Sections</b></a>). |
486 | simplify the tasks of <a title="Application" href="#dt-app">applications</a>, the | 486 | They are also recognized in entity value literals. The use of parameter entities |
487 | <span><a title="XML Processor" href="#dt-xml-proc">XML | 487 | in the internal subset is restricted as described below.</p><div class="constraint"><p class="prefix"><a name="vc-roottype" id="vc-roottype"/><b>Validity constraint: Root Element Type</b></p><p>The <a href="#NT-Name">Name</a> |
488 | processor</a> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> behave as if it</span> normalized all line breaks in external parsed | 488 | in the document type declaration <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match the element type of the <a title="Root Element" href="#dt-root">root element</a>.</p></div><div class="constraint"><p class="prefix"><a name="vc-PEinMarkupDecl" id="vc-PEinMarkupDecl"/><b>Validity constraint: Proper Declaration/PE Nesting</b></p><p>Parameter-entity <a title="Replacement Text" href="#dt-repltext">replacement text</a> |
489 | entities (including the document entity) on input, before parsing, by translating | 489 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be properly nested with markup declarations. That is to say, if either |
490 | 490 | the first character or the last character of a markup declaration (<a href="#NT-markupdecl">markupdecl</a> | |
491 | <span>all of the following to a single #xA character:</span></p><ol type="1"><li><p>the two-character sequence #xD #xA</p></li><li><p>the two-character sequence #xD #x85</p></li><li><p>the single character #x85</p></li><li><p>the single character #x2028</p></li><li><p>any #xD character that is not immediately followed by #xA or #x85.</p></li></ol><p> The characters #x85 and #x2028 cannot be reliably recognized and | 491 | above) is contained in the replacement text for a <a title="Parameter-entity reference" href="#dt-PERef">parameter-entity |
492 | translated until an entity's encoding declaration (if present) has | 492 | reference</a>, both <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be contained in the same replacement text.</p></div><div class="constraint"><p class="prefix"><a name="wfc-PEinInternalSubset" id="wfc-PEinInternalSubset"/><b>Well-formedness constraint: PEs in Internal Subset</b></p><p>In |
493 | been read. Therefore, it is a fatal error to use them within the XML | 493 | the internal DTD subset, <a title="Parameter-entity reference" href="#dt-PERef">parameter-entity references</a> |
494 | declaration or text declaration. | 494 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> occur within markup declarations; they <span>may</span> occur where markup declarations can occur. |
495 | </p></div><div class="div2"> <h3><a name="sec-lang-tag" id="sec-lang-tag" />2.12 Language Identification</h3><p>In document processing, it is often useful to identify the natural or formal | 495 | (This does not apply to references that occur in external parameter entities |
496 | language in which the content is written. A special <a title="Attribute" href="#dt-attr">attribute</a> | 496 | or to the external subset.)</p></div><div class="constraint"><p class="prefix"><a name="ExtSubset" id="ExtSubset"/><b>Well-formedness constraint: External Subset</b></p><p>The external subset, if any, <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match the production for <a href="#NT-extSubset">extSubset</a>.</p></div><div class="constraint"><p class="prefix"><a name="PE-between-Decls" id="PE-between-Decls"/><b>Well-formedness constraint: PE Between Declarations</b></p><p>The replacement text of a parameter entity reference |
497 | named <code>xml:lang</code> <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> be inserted in documents to specify the language | 497 | in a <a href="#NT-DeclSep">DeclSep</a> |
498 | used in the contents and attribute values of any element in an XML document. | 498 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match the production <a href="#NT-extSubsetDecl">extSubsetDecl</a>.</p></div><p>Like the internal subset, the external subset and any external parameter |
499 | In valid documents, this attribute, like any other, <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be <a title="Attribute-List Declaration" href="#dt-attdecl">declared</a> | 499 | entities referenced |
500 | if it is used. The | 500 | in a <a href="#NT-DeclSep">DeclSep</a> |
501 | values of the attribute are language identifiers as defined by <a href="#RFC1766">[IETF RFC 3066]</a>, <cite>Tags | 501 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> consist of a series of |
502 | for the Identification of Languages</cite>, or its successor<span>; in addition, the empty string <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> be specified</span>.</p><p>(Productions 33 through 38 have been removed.)</p><p>For example:</p><div class="exampleInner"><pre><p xml:lang="en">The quick brown fox jumps over the lazy dog.</p> | 502 | complete markup declarations of the types allowed by the non-terminal symbol <a href="#NT-markupdecl">markupdecl</a>, interspersed with white space or <a title="Parameter-entity reference" href="#dt-PERef">parameter-entity references</a>. However, portions of |
503 | <p xml:lang="en-GB">What colour is it?</p> | 503 | the contents of the external subset or of these |
504 | <p xml:lang="en-US">What color is it?</p> | 504 | external parameter entities <span>may</span> conditionally be ignored by using the <a title="conditional section" href="#dt-cond-section">conditional section</a> construct; this is not |
505 | <sp who="Faust" desc='leise' xml:lang="de"> | 505 | allowed in the internal subset but is |
506 | <l>Habe nun, ach! Philosophie,</l> | 506 | allowed in external parameter entities referenced in the internal subset.</p> <h5><a name="ext-Subset" id="ext-Subset"/>External Subset</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-extSubset" id="NT-extSubset"/>[30] </td><td><code>extSubset</code></td><td> ::= </td><td><code> |
507 | <l>Juristerei, und Medizin</l> | 507 | <a href="#NT-TextDecl">TextDecl</a>? <a href="#NT-extSubsetDecl">extSubsetDecl</a> |
508 | <l>und leider auch Theologie</l> | 508 | </code></td></tr><tr valign="baseline"><td><a name="NT-extSubsetDecl" id="NT-extSubsetDecl"/>[31] </td><td><code>extSubsetDecl</code></td><td> ::= </td><td><code>( <a href="#NT-markupdecl">markupdecl</a> | <a href="#NT-conditionalSect">conditionalSect</a> | <a href="#NT-DeclSep">DeclSep</a>)*</code></td></tr></tbody></table><p>The external subset and external parameter entities also differ from the |
509 | <l>durchaus studiert mit hei&#xDF;em Bem&#xFC;h'n.</l> | 509 | internal subset in that in them, <a title="Parameter-entity reference" href="#dt-PERef">parameter-entity |
510 | </sp></pre></div><p>The intent declared with <code>xml:lang</code> is considered to apply to | 510 | references</a> are permitted <em>within</em> markup declarations, |
511 | all attributes and content of the element where it is specified, unless overridden | 511 | not only <em>between</em> markup declarations.</p><p>An example of an XML document with a document type declaration:</p><div class="exampleInner"><pre><?xml version="1.1"?> |
512 | with an instance of <code>xml:lang</code> on another element within that content. <span>In particular, the empty value of <code>xml:lang</code> is used on an element B to override a specification of <code>xml:lang</code> on an enclosing element A, without specifying another language. Within B, it is considered that there is no language information available, just as if <code>xml:lang</code> had not been specified on B or any of its ancestors.</span></p><div class="note"><p class="prefix"><b>Note:</b></p><p>Language information may also be provided by external transport protocols (e.g. HTTP or | 512 | <!DOCTYPE greeting SYSTEM "hello.dtd"> |
513 | MIME). When available, this information may be used by XML applications, but the more local | 513 | <greeting>Hello, world!</greeting> </pre></div><p>The <a title="System Identifier" href="#dt-sysid">system identifier</a> |
514 | information provided by <code>xml:lang</code> should be considered to override it. | 514 | "<code>hello.dtd</code>" |
515 | </p></div><p>A simple declaration for <code>xml:lang</code> might take the form</p><div class="exampleInner"><pre>xml:lang <span>CDATA</span> #IMPLIED</pre></div><p>but specific default values <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> also be given, if appropriate. In a collection | 515 | gives the address (a URI reference) of a DTD for the document.</p><p>The declarations can also be given locally, as in this example:</p><div class="exampleInner"><pre><?xml version="1.1" encoding="UTF-8" ?> |
516 | of French poems for English students, with glosses and notes in English, the <code>xml:lang</code> | 516 | <!DOCTYPE greeting [ |
517 | attribute might be declared this way:</p><div class="exampleInner"><pre><!ATTLIST poem xml:lang <span>CDATA</span> 'fr'> | 517 | <!ELEMENT greeting (#PCDATA)> |
518 | <!ATTLIST gloss xml:lang <span>CDATA</span> 'en'> | 518 | ]> |
519 | <!ATTLIST note xml:lang <span>CDATA</span> 'en'></pre></div></div><div class="div2"> <h3><a name="sec-normalization-checking" id="sec-normalization-checking" />2.13 Normalization Checking</h3><p>All XML <a title="Text Entity" href="#dt-parsedent"> parsed | 519 | <greeting>Hello, world!</greeting></pre></div><p>If both the external and internal subsets are used, the internal subset |
520 | entities</a> (including <a title="Document Entity" href="#dt-docent"> document | 520 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be considered to occur before the external subset. |
521 | entities</a>) <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> be <a title="fully normalized" href="#dt-fullnorm">fully | 521 | This has the effect that entity and attribute-list declarations in the internal |
522 | normalized</a> as per the definition of | 522 | subset take precedence over those in the external subset.</p><p> |
523 | <a href="#sec-CharNorm"><b>B Definitions for Character Normalization</b></a> supplemented by the following definitions of | 523 | If a document is well-formed or valid XML 1.0, and provided it |
524 | <em><a name="dt-relconst" id="dt-relconst" />relevant constructs</em> for XML:</p><ol type="1"><li><p>The <a title="Replacement Text" href="#dt-repltext"> | 524 | does not contain any control characters |
525 | replacement text</a> of all <a title="Text Entity" href="#dt-parsedent">parsed | 525 | in the range [#x7F-#x9F] other than as character escapes, it may be |
526 | entities</a></p></li><li><p>All text matching, in context, one of the following | 526 | made well-formed or valid XML 1.1 respectively simply by changing the |
527 | productions:</p><ol type="a"><li><p><a href="#NT-CData"> | 527 | version number.</p></div><div class="div2"> <h3><a name="sec-rmd" id="sec-rmd"/>2.9 Standalone Document Declaration</h3><p>Markup declarations can affect the content of the document, as passed from |
528 | CData</a></p></li><li><p><a href="#NT-CharData"> | 528 | an <a title="XML Processor" href="#dt-xml-proc">XML processor</a> to an application; examples |
529 | CharData</a></p></li><li><p><a href="#NT-content"> | 529 | are attribute defaults and entity declarations. The standalone document declaration, |
530 | content</a></p></li><li><p><a href="#NT-Name"> Name</a></p></li><li><p><a href="#NT-Nmtoken"> | 530 | which <span>may</span> appear as a component of the XML declaration, signals whether or |
531 | Nmtoken</a></p></li></ol></li></ol><p>However, a document is still well-formed even if it is not | 531 | not there are such declarations which appear external to the <a title="Document Entity" href="#dt-docent">document |
532 | <a title="fully normalized" href="#dt-fullnorm">fully normalized</a>. | 532 | entity</a> |
533 | XML processors <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> provide a user option to verify that the document being | 533 | or in parameter entities. [<a name="dt-extmkpdecl" id="dt-extmkpdecl" title="External Markup Declaration">Definition</a>: An <b>external |
534 | processed is in <a title="fully normalized" href="#dt-fullnorm">fully normalized</a> form, and report to the application whether | 534 | markup declaration</b> is defined as a markup declaration occurring in |
535 | it is or not. The option to not verify <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> be chosen only when the | 535 | the external subset or in a parameter entity (external or internal, the latter |
536 | input text is <a title="certified" href="#dt-certified">certified</a>, | 536 | being included because non-validating processors are not required to read |
537 | as defined by <a href="#sec-CharNorm"><b>B Definitions for Character Normalization</b></a>.</p><p>The verification of full normalization <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be carried out as if by | 537 | them).] |
538 | first verifying that the entity is in <a title="include-normalized" href="#dt-inclnorm">include-normalized</a> | 538 | </p> <h5><a name="fulldtd" id="fulldtd"/>Standalone Document Declaration</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-SDDecl" id="NT-SDDecl"/>[32] </td><td><code>SDDecl</code></td><td> ::= </td><td><code> |
539 | form as defined by <a href="#sec-CharNorm"><b>B Definitions for Character Normalization</b></a> and by then verifying that none of the relevant | 539 | <span> |
540 | constructs listed above begins (after character references are | 540 | |
541 | expanded) with a <a title="composing character" href="#dt-compchar">composing character</a> as defined by | 541 | <a href="#NT-S">S</a> |
542 | <a href="#sec-CharNorm"><b>B Definitions for Character Normalization</b></a>. | 542 | </span> 'standalone' <a href="#NT-Eq">Eq</a> |
543 | Non-validating processors <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> ignore possible | 543 | (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"')) </code></td><td><a href="#vc-check-rmd">[VC: Standalone Document Declaration]</a></td></tr></tbody></table><p>In a standalone document declaration, the value "yes" indicates |
544 | denormalizations that would be caused by inclusion of external | 544 | that there are no <a title="External Markup Declaration" href="#dt-extmkpdecl">external markup declarations</a> which |
545 | entities that they do not read.</p><div class="note"><p class="prefix"><b>Note:</b></p><p>The <a title="composing character" href="#dt-compchar">composing characters</a> are all | 545 | affect the information passed from the XML processor to the application. The |
546 | Unicode characters of non-zero combining class, plus a small number | 546 | value "no" indicates that there are or may be such external |
547 | of class-zero characters that nevertheless take part as a | 547 | markup declarations. Note that the standalone document declaration only denotes |
548 | non-initial character in certain Unicode canonical | 548 | the presence of external <em>declarations</em>; the presence, in a document, |
549 | decompositions. Since these characters are meant to follow | 549 | of references to external <em>entities</em>, when those entities are internally |
550 | base characters, restricting relevant constructs (including | 550 | declared, does not change its standalone status.</p><p>If there are no external markup declarations, the standalone document declaration |
551 | content) from beginning with a <a title="composing character" href="#dt-compchar">composing character</a> does not | 551 | has no meaning. If there are external markup declarations but there is no |
552 | meaningfully diminish the expressiveness of XML.</p></div><p>If, while verifying full normalization, a processor encounters | 552 | standalone document declaration, the value "no" is assumed.</p><p>Any XML document for which <code>standalone="no"</code> holds can be converted |
553 | characters for which it cannot determine the normalization | 553 | algorithmically to a standalone document, which may be desirable for some |
554 | properties (i.e., characters introduced in a version of Unicode <a href="#Unicode">[Unicode]</a> | 554 | network delivery applications.</p><div class="constraint"><p class="prefix"><a name="vc-check-rmd" id="vc-check-rmd"/><b>Validity constraint: Standalone Document Declaration</b></p><p>The |
555 | later than the one used in the implementation of the processor), | 555 | standalone document declaration <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> have the value "no" if |
556 | then the processor <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em>, at user option, ignore any possible | 556 | any external markup declarations contain declarations of:</p><ul><li><p>attributes with <a title="Attribute Default" href="#dt-default">default</a> values, |
557 | denormalizations caused by these characters. The option to ignore | 557 | if elements to which these attributes apply appear in the document without |
558 | those denormalizations <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD NOT</em> be chosen by applications when | 558 | specifications of values for these attributes, or</p></li><li><p>entities (other than <code>amp</code>, |
559 | reliability or security are critical.</p><p> XML processors <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> transform the input to be in | 559 | <code>lt</code>, |
560 | <a title="fully normalized" href="#dt-fullnorm">fully normalized</a> form. | 560 | <code>gt</code>, |
561 | XML applications that create XML 1.1 output | 561 | <code>apos</code>, |
562 | from either XML 1.1 or XML 1.0 input <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> ensure that the output | 562 | <code>quot</code>), if <a title="Entity Reference" href="#dt-entref">references</a> |
563 | is <a title="fully normalized" href="#dt-fullnorm">fully normalized</a>; it is not necessary for internal processing | 563 | to those entities appear in the document, or</p></li><li><p>attributes with |
564 | forms to be <a title="fully normalized" href="#dt-fullnorm">fully normalized</a>.</p><p>The purpose of this section is to strongly encourage XML | 564 | tokenized types, where the |
565 | processors to ensure that the creators of XML documents have | 565 | attribute appears in the document with a value such that |
566 | properly normalized them, so that XML applications can make tests | 566 | <a href="#AVNormalize"><cite>normalization</cite></a> |
567 | such as identity comparisons of strings without having to worry | 567 | will produce a different value from that which would be produced |
568 | about the different possible "spellings" of strings which | 568 | in the absence of the declaration, or</p></li><li><p>element types with <a title="Element content" href="#dt-elemcontent">element content</a>, |
569 | Unicode allows. | 569 | if white space occurs directly within any instance of those types.</p></li></ul></div><p>An example XML declaration with a standalone document declaration:</p><div class="exampleInner"><pre><?xml version="1.1" standalone='yes'?></pre></div></div><div class="div2"> <h3><a name="sec-white-space" id="sec-white-space"/>2.10 White Space Handling</h3><p>In editing XML documents, it is often convenient to use "white space" |
570 | </p><p>When entities are in a non-Unicode encoding, if the processor | 570 | (spaces, tabs, and blank lines) |
571 | transcodes them to Unicode, it <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> use a normalizing transcoder. | 571 | to set apart the markup for greater readability. Such white space is typically |
572 | </p></div></div><div class="div1"> <h2><a name="sec-logical-struct" id="sec-logical-struct" />3 Logical Structures</h2><p>[<a name="dt-element" id="dt-element" title="Element">Definition</a>: Each <a title="XML Document" href="#dt-xml-doc">XML | 572 | not intended for inclusion in the delivered version of the document. On the |
573 | document</a> contains one or more <b>elements</b>, the boundaries | 573 | other hand, "significant" white space that should be preserved |
574 | of which are either delimited by <a title="Start-Tag" href="#dt-stag">start-tags</a> | 574 | in the delivered version is common, for example in poetry and source code.</p><p>An <a title="XML Processor" href="#dt-xml-proc">XML processor</a> |
575 | and <a title="End Tag" href="#dt-etag">end-tags</a>, or, for <a title="Empty" href="#dt-empty">empty</a> | 575 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> always pass |
576 | elements, by an <a title="empty-element tag" href="#dt-eetag">empty-element tag</a>. Each | 576 | all characters in a document that are not markup through to the application. |
577 | element has a type, identified by name, sometimes called its "generic | 577 | A <a title="Validating Processor" href="#dt-validating"> validating XML processor</a> |
578 | identifier" (GI), and <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> have a set of attribute specifications.] | 578 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> also |
579 | Each attribute specification has a <a title="Attribute Name" href="#dt-attrname">name</a> | 579 | inform the application which of these characters constitute white space appearing |
580 | and a <a title="Attribute Value" href="#dt-attrval">value</a>.</p> <h5><a name="IDATJ3S" id="IDATJ3S" />Element</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-element" id="NT-element" />[39] </td><td><code>element</code></td><td> ::= </td><td><code><a href="#NT-EmptyElemTag">EmptyElemTag</a></code></td></tr><tr valign="baseline"><td /><td /><td /><td><code>| <a href="#NT-STag">STag</a> <a href="#NT-content">content</a> <a href="#NT-ETag">ETag</a></code></td><td><a href="#GIMatch">[WFC: Element Type Match]</a></td></tr><tr valign="baseline"><td /><td /><td /><td /><td><a href="#elementvalid">[VC: Element Valid]</a></td></tr></tbody></table><p>This specification does not constrain the semantics, use, or (beyond syntax) | 580 | in <a title="Element content" href="#dt-elemcontent">element content</a>.</p><p>A special <a title="Attribute" href="#dt-attr">attribute</a> named <code>xml:space</code> <span>may</span> be attached to an element to signal an intention that in that element, |
581 | names of the element types and attributes, except that names beginning with | 581 | white space should be preserved by applications. In valid documents, this |
582 | a match to <code>(('X'|'x')('M'|'m')('L'|'l'))</code> are reserved for standardization | 582 | attribute, like any other, <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be <a title="Attribute-List Declaration" href="#dt-attdecl">declared</a> |
583 | in this or future versions of this specification.</p><div class="constraint"><p class="prefix"><a name="GIMatch" id="GIMatch" /><b>Well-formedness constraint: Element Type Match</b></p><p>The <a href="#NT-Name">Name</a> | 583 | if it is used. When declared, it <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be given as an <a title="Enumerated Attribute
Values" href="#dt-enumerated">enumerated |
584 | in an element's end-tag <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match the element type in the start-tag.</p></div><div class="constraint"><p class="prefix"><a name="elementvalid" id="elementvalid" /><b>Validity constraint: Element Valid</b></p><p>An element is valid | 584 | type</a> whose values |
585 | if there is a declaration matching <a href="#NT-elementdecl">elementdecl</a> | 585 | are one or both of "default" and "preserve". |
586 | where the <a href="#NT-Name">Name</a> matches the element type, and one of | 586 | For example:</p><div class="exampleInner"><pre><!ATTLIST poem xml:space (default|preserve) 'preserve'> |
587 | the following holds:</p><ol type="1"><li><p>The declaration matches <b>EMPTY</b> and the element has no <a title="Content" href="#dt-content">content</a> <span>(not even entity | 587 | <!ATTLIST pre xml:space (preserve) #FIXED 'preserve'></pre></div><p>The value "default" signals that applications' default white-space |
588 | references, comments, PIs or white space)</span>.</p></li><li><p>The declaration matches <a href="#NT-children">children</a> and the | 588 | processing modes are acceptable for this element; the value "preserve" |
589 | sequence of <a title="Parent/Child" href="#dt-parentchild">child elements</a> belongs | 589 | indicates the intent that applications preserve all the white space. This |
590 | to the language generated by the regular expression in the content model, | 590 | declared intent is considered to apply to all elements within the content |
591 | with optional white space<span>, comments and | 591 | of the element where it is specified, unless overridden with |
592 | PIs (i.e. markup matching production [27] <a href="#NT-Misc">Misc</a>)</span> between the | 592 | another instance of the <code>xml:space</code> attribute. This specification does not give meaning to any value of <code>xml:space</code> other than "default" and "preserve". It is an error for other values to be specified; the XML processor <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> report the error or <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> recover by ignoring the attribute specification or by reporting the (erroneous) value to the application. Applications may ignore or reject erroneous values.</p><p>The <a title="Root Element" href="#dt-root">root element</a> of any document is considered |
593 | start-tag and the first child element, between child elements, or between | 593 | to have signaled no intentions as regards application space handling, unless |
594 | the last child element and the end-tag. Note that a CDATA section containing | 594 | it provides a value for this attribute or the attribute is declared with a |
595 | only white space <span>or a reference | 595 | default value.</p></div><div class="div2"> <h3><a name="sec-line-ends" id="sec-line-ends"/>2.11 End-of-Line Handling</h3><p>XML <a title="Text Entity" href="#dt-parsedent">parsed entities</a> are often stored |
596 | to an entity whose replacement text is character references expanding to white | 596 | in computer files which, for editing convenience, are organized into lines. |
597 | space</span> <span>do</span> not | 597 | These lines are typically separated by some combination of the characters |
598 | match the nonterminal <a href="#NT-S">S</a>, and | 598 | CARRIAGE RETURN (#xD) and LINE FEED (#xA).</p><p>To |
599 | hence cannot appear in these positions<span>; however, a | 599 | simplify the tasks of <a title="Application" href="#dt-app">applications</a>, the |
600 | reference to an internal entity with a literal value consisting of character | 600 | <a title="XML Processor" href="#dt-xml-proc">XML |
601 | references expanding to white space does match <a href="#NT-S">S</a>, since its | 601 | processor</a> |
602 | replacement text is the white space resulting from expansion of the character | 602 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> behave as if it normalized all line breaks in external parsed |
603 | references</span>.</p></li><li><p>The declaration matches <a href="#NT-Mixed">Mixed</a> and the content | 603 | entities (including the document entity) on input, before parsing, by translating |
604 | <span>(after replacing | 604 | all of the following to a single #xA character:</p><ol class="enumar"><li><p>the two-character sequence #xD #xA</p></li><li><p>the two-character sequence #xD #x85</p></li><li><p>the single character #x85</p></li><li><p>the single character #x2028</p></li><li><p>any #xD character that is not immediately followed by #xA or #x85.</p></li></ol><p> The characters #x85 and #x2028 cannot be reliably recognized and |
605 | any entity references with their replacement text)</span> consists of | 605 | translated until an entity's encoding declaration (if present) has |
606 | <a title="Character Data" href="#dt-chardata">character data</a><span>, | 606 | been read. Therefore, it is a fatal error to use them within the XML |
607 | <a title="Comment" href="#dt-comment">comments</a>, <a title="Processing instruction" href="#dt-pi">PIs</a></span> and <a title="Parent/Child" href="#dt-parentchild">child elements</a> whose types match names in the | 607 | declaration or text declaration. |
608 | content model.</p></li><li><p>The declaration matches <b>ANY</b>, and the | 608 | </p></div><div class="div2"> <h3><a name="sec-lang-tag" id="sec-lang-tag"/>2.12 Language Identification</h3><p>In document processing, it is often useful to identify the natural or formal |
609 | <span>content | 609 | language in which the content is written. A special <a title="Attribute" href="#dt-attr">attribute</a> |
610 | <span>(after replacing | 610 | named <code>xml:lang</code> <span>may</span> be inserted in documents to specify the language |
611 | any entity references with their replacement text)</span> | 611 | used in the contents and attribute values of any element in an XML document. |
612 | consists of character data and <a title="Parent/Child" href="#dt-parentchild">child elements</a> | 612 | In valid documents, this attribute, like any other, <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be <a title="Attribute-List Declaration" href="#dt-attdecl">declared</a> |
613 | whose types</span> | 613 | if it is used. The |
614 | have been declared.</p></li></ol></div><div class="div2"> <h3><a name="sec-starttags" id="sec-starttags" />3.1 Start-Tags, End-Tags, and Empty-Element Tags</h3><p>[<a name="dt-stag" id="dt-stag" title="Start-Tag">Definition</a>: The beginning of every non-empty | 614 | values of the attribute are language identifiers as defined by <a href="#RFC1766">[IETF RFC 3066]</a>, <cite>Tags |
615 | XML element is marked by a <b>start-tag</b>.]</p> <h5><a name="IDA3O3S" id="IDA3O3S" />Start-tag</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-STag" id="NT-STag" />[40] </td><td><code>STag</code></td><td> ::= </td><td><code>'<' <a href="#NT-Name">Name</a> (<a href="#NT-S">S</a> <a href="#NT-Attribute">Attribute</a>)* <a href="#NT-S">S</a>? '>'</code></td><td><a href="#uniqattspec">[WFC: Unique Att Spec]</a></td></tr><tr valign="baseline"><td><a name="NT-Attribute" id="NT-Attribute" />[41] </td><td><code>Attribute</code></td><td> ::= </td><td><code><a href="#NT-Name">Name</a> <a href="#NT-Eq">Eq</a> <a href="#NT-AttValue">AttValue</a></code></td><td><a href="#ValueType">[VC: Attribute Value Type]</a></td></tr><tr valign="baseline"><td /><td /><td /><td /><td><a href="#NoExternalRefs">[WFC: No External Entity References]</a></td></tr><tr valign="baseline"><td /><td /><td /><td /><td><a href="#CleanAttrVals">[WFC: No < in Attribute Values]</a></td></tr></tbody></table><p>The <a href="#NT-Name">Name</a> in the start- and end-tags gives the element's <b>type</b>. [<a name="dt-attr" id="dt-attr" title="Attribute">Definition</a>: The <a href="#NT-Name">Name</a>-<a href="#NT-AttValue">AttValue</a> | 615 | for the Identification of Languages</cite>, or its successor; in addition, the empty string <span>may</span> be specified.</p><p>(Productions 33 through 38 have been removed.)</p><p>For example:</p><div class="exampleInner"><pre><p xml:lang="en">The quick brown fox jumps over the lazy dog.</p> |
616 | pairs are referred to as the <b>attribute specifications</b> of the | 616 | <p xml:lang="en-GB">What colour is it?</p> |
617 | element], [<a name="dt-attrname" id="dt-attrname" title="Attribute Name">Definition</a>: with the <a href="#NT-Name">Name</a> in each pair referred to as the <b>attribute name</b>] | 617 | <p xml:lang="en-US">What color is it?</p> |
618 | and [<a name="dt-attrval" id="dt-attrval" title="Attribute Value">Definition</a>: the content of the <a href="#NT-AttValue">AttValue</a> (the text between the <code>'</code> or <code>"</code> | 618 | <sp who="Faust" desc='leise' xml:lang="de"> |
619 | delimiters) as the <b>attribute value</b>.] Note | 619 | <l>Habe nun, ach! Philosophie,</l> |
620 | that the order of attribute specifications in a start-tag or empty-element | 620 | <l>Juristerei, und Medizin</l> |
621 | tag is not significant.</p><div class="constraint"><p class="prefix"><a name="uniqattspec" id="uniqattspec" /><b>Well-formedness constraint: Unique Att Spec</b></p><p><span class="mustard">An attribute name | 621 | <l>und leider auch Theologie</l> |
622 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em></span> appear more than once in the same start-tag or empty-element tag.</p></div><div class="constraint"><p class="prefix"><a name="ValueType" id="ValueType" /><b>Validity constraint: Attribute Value Type</b></p><p>The attribute <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> | 622 | <l>durchaus studiert mit hei&#xDF;em Bem&#xFC;h'n.</l> |
623 | have been declared; the value <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be of the type declared for it. (For attribute | 623 | </sp></pre></div><p>The <span> |
624 | types, see <a href="#attdecls"><b>3.3 Attribute-List Declarations</b></a>.)</p></div><div class="constraint"><p class="prefix"><a name="NoExternalRefs" id="NoExternalRefs" /><b>Well-formedness constraint: No External Entity References</b></p><p>Attribute | 624 | language specified by</span> |
625 | values <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em></span> contain direct or indirect entity references to external entities.</p></div><div class="constraint"><p class="prefix"><a name="CleanAttrVals" id="CleanAttrVals" /><b>Well-formedness constraint: No <code><</code> in Attribute Values</b></p><p>The <a title="Replacement Text" href="#dt-repltext">replacement text</a> of any entity | 625 | <code>xml:lang</code> |
626 | referred to directly or indirectly in an attribute value <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> contain a <code><</code>.</p></div><p>An example of a start-tag:</p><div class="exampleInner"><pre><termdef id="dt-dog" term="dog"></pre></div><p>[<a name="dt-etag" id="dt-etag" title="End Tag">Definition</a>: The end of every element that begins | 626 | |
627 | with a start-tag <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be marked by an <b>end-tag</b> containing a name | 627 | <span> |
628 | that echoes the element's type as given in the start-tag:]</p> <h5><a name="IDA3U3S" id="IDA3U3S" />End-tag</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-ETag" id="NT-ETag" />[42] </td><td><code>ETag</code></td><td> ::= </td><td><code>'</' <a href="#NT-Name">Name</a> <a href="#NT-S">S</a>? | 628 | applies</span> |
629 | '>'</code></td></tr></tbody></table><p>An example of an end-tag:</p><div class="exampleInner"><pre></termdef></pre></div><p>[<a name="dt-content" id="dt-content" title="Content">Definition</a>: The <a title="Text" href="#dt-text">text</a> | 629 | to the element where it is specified<span> |
630 | between the start-tag and end-tag is called the element's <b>content</b>:]</p> <h5><a name="IDAKW3S" id="IDAKW3S" />Content of Elements</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-content" id="NT-content" />[43] </td><td><code>content</code></td><td> ::= </td><td><code><a href="#NT-CharData">CharData</a>? ((<a href="#NT-element">element</a> | 630 | |
631 | | <a href="#NT-Reference">Reference</a> | <a href="#NT-CDSect">CDSect</a> | 631 | (including the values of its attributes), and to all elements in its content</span> unless |
632 | | <a href="#NT-PI">PI</a> | <a href="#NT-Comment">Comment</a>) <a href="#NT-CharData">CharData</a>?)*</code></td></tr></tbody></table><p>[<a name="dt-empty" id="dt-empty" title="Empty">Definition</a>: An element | 632 | overridden with <span> |
633 | with no <a href="#NT-content">content</a> is said to be <b>empty</b>.] The representation | 633 | another</span> instance of <code>xml:lang</code>. In particular, the empty value of <code>xml:lang</code> is used on an element B to override |
634 | of an empty element is either a start-tag immediately followed by an end-tag, | 634 | a specification of <code>xml:lang</code> on an enclosing element A, without specifying another language. Within B, |
635 | or an empty-element tag. [<a name="dt-eetag" id="dt-eetag" title="empty-element tag">Definition</a>: An <b>empty-element | 635 | it is considered that there is no language information available, just as if <code>xml:lang</code> had not been specified |
636 | tag</b> takes a special form:]</p> <h5><a name="IDARY3S" id="IDARY3S" />Tags for Empty Elements</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-EmptyElemTag" id="NT-EmptyElemTag" />[44] </td><td><code>EmptyElemTag</code></td><td> ::= </td><td><code>'<' <a href="#NT-Name">Name</a> (<a href="#NT-S">S</a> <a href="#NT-Attribute">Attribute</a>)* <a href="#NT-S">S</a>? '/>'</code></td><td><a href="#uniqattspec">[WFC: Unique Att Spec]</a></td></tr></tbody></table><p>Empty-element tags <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> be used for any element which has no content, whether | 636 | on B or any of its ancestors.<span> |
637 | or not it is declared using the keyword <b>EMPTY</b>. <a title="For interoperability" href="#dt-interop">For | 637 | Applications determine which of an element's attribute values |
638 | interoperability</a>, the empty-element tag <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> | 638 | and which parts of its character content, if any, are treated as language-dependent values described by <code>xml:lang</code>.</span> |
639 | be used, and <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> only be used, for elements which are declared | 639 | </p><div class="note"><p class="prefix"><b>Note:</b></p><p>Language information may also be provided by external transport protocols (e.g. HTTP or |
640 | EMPTY.</p><p>Examples of empty elements:</p><div class="exampleInner"><pre><IMG align="left" | 640 | MIME). When available, this information may be used by XML applications, but the more local |
641 | src="http://www.w3.org/Icons/WWW/w3c_home" /> | 641 | information provided by <code>xml:lang</code> should be considered to override it. |
642 | <br></br> | 642 | </p></div><p>A simple declaration for <code>xml:lang</code> might take the form</p><div class="exampleInner"><pre>xml:lang CDATA #IMPLIED</pre></div><p>but specific default values <span>may</span> also be given, if appropriate. In a collection |
643 | <br/></pre></div></div><div class="div2"> <h3><a name="elemdecls" id="elemdecls" />3.2 Element Type Declarations</h3><p>The <a title="Element" href="#dt-element">element</a> structure of an <a title="XML Document" href="#dt-xml-doc">XML document</a> <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em>, for <a title="Validity" href="#dt-valid">validation</a> | 643 | of French poems for English students, with glosses and notes in English, the <code>xml:lang</code> |
644 | purposes, be constrained using element type and attribute-list declarations. | 644 | attribute might be declared this way:</p><div class="exampleInner"><pre><!ATTLIST poem xml:lang CDATA 'fr'> |
645 | An element type declaration constrains the element's <a title="Content" href="#dt-content">content</a>.</p><p>Element type declarations often constrain which element types can appear | 645 | <!ATTLIST gloss xml:lang CDATA 'en'> |
646 | as <a title="Parent/Child" href="#dt-parentchild">children</a> of the element. At user | 646 | <!ATTLIST note xml:lang CDATA 'en'></pre></div></div><div class="div2"> <h3><a name="sec-normalization-checking" id="sec-normalization-checking"/>2.13 Normalization Checking</h3><p>All XML <a title="Text Entity" href="#dt-parsedent"> parsed |
647 | option, an XML processor <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> issue a warning when a declaration mentions an | 647 | entities</a> (including <a title="Document Entity" href="#dt-docent"> document |
648 | element type for which no declaration is provided, but this is not an error.</p><p>[<a name="dt-eldecl" id="dt-eldecl" title="Element Type declaration">Definition</a>: An <b>element | 648 | entities</a>) <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> be <a title="fully normalized" href="#dt-fullnorm">fully |
649 | type declaration</b> takes the form:]</p> <h5><a name="IDAV13S" id="IDAV13S" />Element Type Declaration</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-elementdecl" id="NT-elementdecl" />[45] </td><td><code>elementdecl</code></td><td> ::= </td><td><code>'<!ELEMENT' <a href="#NT-S">S</a> <a href="#NT-Name">Name</a> <a href="#NT-S">S</a> <a href="#NT-contentspec">contentspec</a> <a href="#NT-S">S</a>? | 649 | normalized</a> as per the definition of |
650 | '>'</code></td><td><a href="#EDUnique">[VC: Unique Element Type Declaration]</a></td></tr><tr valign="baseline"><td><a name="NT-contentspec" id="NT-contentspec" />[46] </td><td><code>contentspec</code></td><td> ::= </td><td><code>'EMPTY' | 'ANY' | <a href="#NT-Mixed">Mixed</a> | 650 | <a href="#sec-CharNorm"><b>B Definitions for Character Normalization</b></a> supplemented by the following definitions of |
651 | | <a href="#NT-children">children</a></code></td></tr></tbody></table><p>where the <a href="#NT-Name">Name</a> gives the element type being declared.</p><div class="constraint"><p class="prefix"><a name="EDUnique" id="EDUnique" /><b>Validity constraint: Unique Element Type Declaration</b></p><p><span class="mustard">An element | 651 | <em><a name="dt-relconst" id="dt-relconst"/>relevant constructs</em> for XML:</p><ol class="enumar"><li><p>The <a title="Replacement Text" href="#dt-repltext"> |
652 | type <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em></span> be declared more than once.</p></div><p>Examples of element type declarations:</p><div class="exampleInner"><pre><!ELEMENT br EMPTY> | 652 | replacement text</a> of all <a title="Text Entity" href="#dt-parsedent">parsed |
653 | <!ELEMENT p (#PCDATA|emph)* > | 653 | entities</a> |
654 | <!ELEMENT %name.para; %content.para; > | 654 | </p></li><li><p>All text matching, in context, one of the following |
655 | <!ELEMENT container ANY></pre></div><div class="div3"> <h4><a name="sec-element-content" id="sec-element-content" />3.2.1 Element Content</h4><p>[<a name="dt-elemcontent" id="dt-elemcontent" title="Element content">Definition</a>: An element <a title="Start-Tag" href="#dt-stag">type</a> has <b>element content</b> when elements | 655 | productions:</p><ol class="enumla"><li><p> |
656 | of that type <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> contain only <a title="Parent/Child" href="#dt-parentchild">child</a> | 656 | <a href="#NT-CData"> |
657 | elements (no character data), optionally separated by white space (characters | 657 | CData</a> |
658 | matching the nonterminal <a href="#NT-S">S</a>).] [<a name="dt-content-model" id="dt-content-model" title="Content model">Definition</a>: In this case, the constraint includes a <b>content | 658 | </p></li><li><p> |
659 | model</b>, a simple grammar governing the allowed types of the | 659 | <a href="#NT-CharData"> |
660 | child elements and the order in which they are allowed to appear.] | 660 | CharData</a> |
661 | The grammar is built on content particles (<a href="#NT-cp">cp</a>s), which | 661 | </p></li><li><p> |
662 | consist of names, choice lists of content particles, or sequence lists of | 662 | <a href="#NT-content"> |
663 | content particles:</p> <h5><a name="IDAP53S" id="IDAP53S" />Element-content Models</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-children" id="NT-children" />[47] </td><td><code>children</code></td><td> ::= </td><td><code>(<a href="#NT-choice">choice</a> | <a href="#NT-seq">seq</a>) | 663 | content</a> |
664 | ('?' | '*' | '+')?</code></td></tr><tr valign="baseline"><td><a name="NT-cp" id="NT-cp" />[48] </td><td><code>cp</code></td><td> ::= </td><td><code>(<a href="#NT-Name">Name</a> | <a href="#NT-choice">choice</a> | 664 | </p></li><li><p> |
665 | | <a href="#NT-seq">seq</a>) ('?' | '*' | '+')?</code></td></tr><tr valign="baseline"><td><a name="NT-choice" id="NT-choice" />[49] </td><td><code>choice</code></td><td> ::= </td><td><code>'(' <a href="#NT-S">S</a>? <a href="#NT-cp">cp</a> ( <a href="#NT-S">S</a>? '|' <a href="#NT-S">S</a>? <a href="#NT-cp">cp</a> )+ <a href="#NT-S">S</a>? ')'</code></td><td><a href="#vc-PEinGroup">[VC: Proper Group/PE Nesting]</a></td></tr><tr valign="baseline"><td><a name="NT-seq" id="NT-seq" />[50] </td><td><code>seq</code></td><td> ::= </td><td><code>'(' <a href="#NT-S">S</a>? <a href="#NT-cp">cp</a> ( <a href="#NT-S">S</a>? ',' <a href="#NT-S">S</a>? <a href="#NT-cp">cp</a> )* <a href="#NT-S">S</a>? ')'</code></td><td><a href="#vc-PEinGroup">[VC: Proper Group/PE Nesting]</a></td></tr></tbody></table><p>where each <a href="#NT-Name">Name</a> is the type of an element which | 665 | <a href="#NT-Name"> Name</a> |
666 | <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> appear as a <a title="Parent/Child" href="#dt-parentchild">child</a>. Any content | 666 | </p></li><li><p> |
667 | particle in a choice list <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> appear in the <a title="Element content" href="#dt-elemcontent">element | 667 | <a href="#NT-Nmtoken"> |
668 | content</a> at the location where the choice list appears in the grammar; | 668 | Nmtoken</a> |
669 | content particles occurring in a sequence list <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> each appear in the <a title="Element content" href="#dt-elemcontent">element content</a> in the order given in the list. | 669 | </p></li></ol></li></ol><p>However, a document is still well-formed even if it is not |
670 | The optional character following a name or list governs whether the element | 670 | <a title="fully normalized" href="#dt-fullnorm">fully normalized</a>. |
671 | or the content particles in the list may occur one or more (<code>+</code>), | 671 | XML processors <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> provide a user option to verify that the document being |
672 | zero or more (<code>*</code>), or zero or one times (<code>?</code>). The | 672 | processed is in <a title="fully normalized" href="#dt-fullnorm">fully normalized</a> form, and report to the application whether |
673 | absence of such an operator means that the element or content particle <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> | 673 | it is or not. The option to not verify <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> be chosen only when the |
674 | appear exactly once. This syntax and meaning are identical to those used in | 674 | input text is <a title="certified" href="#dt-certified">certified</a>, |
675 | the productions in this specification.</p><p>The content of an element matches a content model if and only if it is | 675 | as defined by <a href="#sec-CharNorm"><b>B Definitions for Character Normalization</b></a>.</p><p>The verification of full normalization <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be carried out as if by |
676 | possible to trace out a path through the content model, obeying the sequence, | 676 | first verifying that the entity is in <a title="include-normalized" href="#dt-inclnorm">include-normalized</a> |
677 | choice, and repetition operators and matching each element in the content | 677 | form as defined by <a href="#sec-CharNorm"><b>B Definitions for Character Normalization</b></a> and by then verifying that none of the relevant |
678 | against an element type in the content model. <a title="For Compatibility" href="#dt-compat">For | 678 | constructs listed above begins (after character references are |
679 | compatibility</a>, it is an error if <span>the content model | 679 | expanded) with a <a title="composing character" href="#dt-compchar">composing character</a> as defined by |
680 | allows an element to match more than one occurrence of an element type in the | 680 | <a href="#sec-CharNorm"><b>B Definitions for Character Normalization</b></a>. |
681 | content model</span>. For more information, see <a href="#determinism"><b>D Deterministic Content Models</b></a>.</p><div class="constraint"><p class="prefix"><a name="vc-PEinGroup" id="vc-PEinGroup" /><b>Validity constraint: Proper Group/PE Nesting</b></p><p>Parameter-entity <a title="Replacement Text" href="#dt-repltext">replacement text</a> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be properly nested with parenthesized | 681 | Non-validating processors <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> ignore possible |
682 | groups. That is to say, if either of the opening or closing parentheses in | 682 | denormalizations that would be caused by inclusion of external |
683 | a <a href="#NT-choice">choice</a>, <a href="#NT-seq">seq</a>, or <a href="#NT-Mixed">Mixed</a> | 683 | entities that they do not read.</p><div class="note"><p class="prefix"><b>Note:</b></p><p>The <a title="composing character" href="#dt-compchar">composing characters</a> are all |
684 | construct is contained in the replacement text for a <a title="Parameter-entity reference" href="#dt-PERef">parameter | 684 | Unicode characters of non-zero combining class, plus a small number |
685 | entity</a>, both <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be contained in the same replacement text.</p><p><a title="For interoperability" href="#dt-interop">For interoperability</a>, if a parameter-entity reference | 685 | of class-zero characters that nevertheless take part as a |
686 | appears in a <a href="#NT-choice">choice</a>, <a href="#NT-seq">seq</a>, or <a href="#NT-Mixed">Mixed</a> construct, its replacement text <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> contain at | 686 | non-initial character in certain Unicode canonical |
687 | least one non-blank character, and neither the first nor last non-blank character | 687 | decompositions. Since these characters are meant to follow |
688 | of the replacement text <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> be a connector (<code>|</code> or <code>,</code>).</p></div><p>Examples of element-content models:</p><div class="exampleInner"><pre><!ELEMENT spec (front, body, back?)> | 688 | base characters, restricting relevant constructs (including |
689 | <!ELEMENT div1 (head, (p | list | note)*, div2*)> | 689 | content) from beginning with a <a title="composing character" href="#dt-compchar">composing character</a> does not |
690 | <!ELEMENT dictionary-body (%div.mix; | %dict.mix;)*></pre></div></div><div class="div3"> <h4><a name="sec-mixed-content" id="sec-mixed-content" />3.2.2 Mixed Content</h4><p>[<a name="dt-mixed" id="dt-mixed" title="Mixed Content">Definition</a>: An element <a title="Start-Tag" href="#dt-stag">type</a> | 690 | meaningfully diminish the expressiveness of XML.</p></div><p>If, while verifying full normalization, a processor encounters |
691 | has <b>mixed content</b> when elements of that type <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> contain character | 691 | characters for which it cannot determine the normalization |
692 | data, optionally interspersed with <a title="Parent/Child" href="#dt-parentchild">child</a> | 692 | properties (i.e., characters introduced in a version of Unicode <a href="#Unicode">[Unicode]</a> |
693 | elements.] In this case, the types of the child elements <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> be constrained, | 693 | later than the one used in the implementation of the processor), |
694 | but not their order or their number of occurrences:</p> <h5><a name="IDAUHCU" id="IDAUHCU" />Mixed-content Declaration</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-Mixed" id="NT-Mixed" />[51] </td><td><code>Mixed</code></td><td> ::= </td><td><code>'(' <a href="#NT-S">S</a>? '#PCDATA' (<a href="#NT-S">S</a>? | 694 | then the processor <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em>, at user option, ignore any possible |
695 | '|' <a href="#NT-S">S</a>? <a href="#NT-Name">Name</a>)* <a href="#NT-S">S</a>? | 695 | denormalizations caused by these characters. The option to ignore |
696 | ')*' </code></td></tr><tr valign="baseline"><td /><td /><td /><td><code>| '(' <a href="#NT-S">S</a>? '#PCDATA' <a href="#NT-S">S</a>? ')' </code></td><td><a href="#vc-PEinGroup">[VC: Proper Group/PE Nesting]</a></td></tr><tr valign="baseline"><td /><td /><td /><td /><td><a href="#vc-MixedChildrenUnique">[VC: No Duplicate Types]</a></td></tr></tbody></table><p>where the <a href="#NT-Name">Name</a>s give the types of elements that | 696 | those denormalizations <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD NOT</em> be chosen by applications when |
697 | may appear as children. The | 697 | reliability or security are critical.</p><p> XML processors <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> transform the input to be in |
698 | keyword <b>#PCDATA</b> derives historically from the term "parsed | 698 | <a title="fully normalized" href="#dt-fullnorm">fully normalized</a> form. |
699 | character data."</p><div class="constraint"><p class="prefix"><a name="vc-MixedChildrenUnique" id="vc-MixedChildrenUnique" /><b>Validity constraint: No Duplicate Types</b></p><p>The | 699 | XML applications that create XML 1.1 output |
700 | same name <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> appear more than once in a single mixed-content declaration.</p></div><p>Examples of mixed content declarations:</p><div class="exampleInner"><pre><!ELEMENT p (#PCDATA|a|ul|b|i|em)*> | 700 | from either XML 1.1 or XML 1.0 input <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> ensure that the output |
701 | <!ELEMENT p (#PCDATA | %font; | %phrase; | %special; | %form;)* > | 701 | is <a title="fully normalized" href="#dt-fullnorm">fully normalized</a>; it is not necessary for internal processing |
702 | <!ELEMENT b (#PCDATA)></pre></div></div></div><div class="div2"> <h3><a name="attdecls" id="attdecls" />3.3 Attribute-List Declarations</h3><p><a title="Attribute" href="#dt-attr">Attributes</a> are used to associate name-value | 702 | forms to be <a title="fully normalized" href="#dt-fullnorm">fully normalized</a>.</p><p>The purpose of this section is to strongly encourage XML |
703 | pairs with <a title="Element" href="#dt-element">elements</a>. Attribute specifications | 703 | processors to ensure that the creators of XML documents have |
704 | <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> appear outside of</span> <a title="Start-Tag" href="#dt-stag">start-tags</a> and <a title="empty-element tag" href="#dt-eetag">empty-element tags</a>; thus, the productions used to | 704 | properly normalized them, so that XML applications can make tests |
705 | recognize them appear in <a href="#sec-starttags"><b>3.1 Start-Tags, End-Tags, and Empty-Element Tags</b></a>. Attribute-list declarations | 705 | such as identity comparisons of strings without having to worry |
706 | <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> be used:</p><ul><li><p>To define the set of attributes pertaining to a given element type.</p></li><li><p>To establish type constraints for these attributes.</p></li><li><p>To provide <a title="Attribute Default" href="#dt-default">default values</a> for | 706 | about the different possible "spellings" of strings which |
707 | attributes.</p></li></ul><p>[<a name="dt-attdecl" id="dt-attdecl" title="Attribute-List Declaration">Definition</a>: <b>Attribute-list | 707 | Unicode allows. |
708 | declarations</b> specify the name, data type, and default value (if any) | 708 | </p><p>When entities are in a non-Unicode encoding, if the processor |
709 | of each attribute associated with a given element type:]</p> <h5><a name="IDADMCU" id="IDADMCU" />Attribute-list Declaration</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-AttlistDecl" id="NT-AttlistDecl" />[52] </td><td><code>AttlistDecl</code></td><td> ::= </td><td><code>'<!ATTLIST' <a href="#NT-S">S</a> <a href="#NT-Name">Name</a> <a href="#NT-AttDef">AttDef</a>* <a href="#NT-S">S</a>? '>'</code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-AttDef" id="NT-AttDef" />[53] </td><td><code>AttDef</code></td><td> ::= </td><td><code><a href="#NT-S">S</a> <a href="#NT-Name">Name</a> <a href="#NT-S">S</a> <a href="#NT-AttType">AttType</a> <a href="#NT-S">S</a> <a href="#NT-DefaultDecl">DefaultDecl</a></code></td></tr></tbody></table><p>The <a href="#NT-Name">Name</a> in the <a href="#NT-AttlistDecl">AttlistDecl</a> | 709 | transcodes them to Unicode, it <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> use a normalizing transcoder. |
710 | rule is the type of an element. At user option, an XML processor <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> issue | 710 | </p></div></div><div class="div1"> <h2><a name="sec-logical-struct" id="sec-logical-struct"/>3 Logical Structures</h2><p> |
711 | a warning if attributes are declared for an element type not itself declared, | 711 | [<a name="dt-element" id="dt-element" title="Element">Definition</a>: Each <a title="XML Document" href="#dt-xml-doc">XML |
712 | but this is not an error. The <a href="#NT-Name">Name</a> in the <a href="#NT-AttDef">AttDef</a> | 712 | document</a> contains one or more <b>elements</b>, the boundaries |
713 | rule is the name of the attribute.</p><p>When more than one <a href="#NT-AttlistDecl">AttlistDecl</a> is provided | 713 | of which are either delimited by <a title="Start-Tag" href="#dt-stag">start-tags</a> |
714 | for a given element type, the contents of all those provided are merged. When | 714 | and <a title="End Tag" href="#dt-etag">end-tags</a>, or, for <a title="Empty" href="#dt-empty">empty</a> |
715 | more than one definition is provided for the same attribute of a given element | 715 | elements, by an <a title="empty-element tag" href="#dt-eetag">empty-element tag</a>. Each |
716 | type, the first declaration is binding and later declarations are ignored. <a title="For interoperability" href="#dt-interop">For interoperability,</a> writers of DTDs <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> choose | 716 | element has a type, identified by name, sometimes called its "generic |
717 | to provide at most one attribute-list declaration for a given element type, | 717 | identifier" (GI), and <span>may</span> have a set of attribute specifications.] |
718 | at most one attribute definition for a given attribute name in an attribute-list | 718 | Each attribute specification has a <a title="Attribute Name" href="#dt-attrname">name</a> |
719 | declaration, and at least one attribute definition in each attribute-list | 719 | and a <a title="Attribute Value" href="#dt-attrval">value</a>.</p> <h5><a name="IDALUFS" id="IDALUFS"/>Element</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-element" id="NT-element"/>[39] </td><td><code>element</code></td><td> ::= </td><td><code> |
720 | declaration. For interoperability, an XML processor <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> at user option | 720 | <a href="#NT-EmptyElemTag">EmptyElemTag</a> |
721 | issue a warning when more than one attribute-list declaration is provided | 721 | </code></td></tr><tr valign="baseline"><td/><td/><td/><td><code>| <a href="#NT-STag">STag</a> |
722 | for a given element type, or more than one attribute definition is provided | 722 | <a href="#NT-content">content</a> |
723 | for a given attribute, but this is not an error.</p><div class="div3"> <h4><a name="sec-attribute-types" id="sec-attribute-types" />3.3.1 Attribute Types</h4><p>XML attribute types are of three kinds: a string type, a set of tokenized | 723 | <a href="#NT-ETag">ETag</a> |
724 | types, and enumerated types. The string type may take any literal string as | 724 | </code></td><td><a href="#GIMatch">[WFC: Element Type Match]</a></td></tr><tr valign="baseline"><td/><td/><td/><td/><td><a href="#elementvalid">[VC: Element Valid]</a></td></tr></tbody></table><p>This specification does not constrain the |
725 | a value; the tokenized types have varying lexical and semantic constraints. | 725 | <span> |
726 | The validity constraints noted in the grammar are applied after the attribute | 726 | application </span>semantics, use, or (beyond syntax) |
727 | value has been normalized as described in <span><a href="#AVNormalize"><b>3.3.3 Attribute-Value Normalization</b></a></span>.</p> <h5><a name="IDAPPCU" id="IDAPPCU" />Attribute Types</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-AttType" id="NT-AttType" />[54] </td><td><code>AttType</code></td><td> ::= </td><td><code><a href="#NT-StringType">StringType</a> | <a href="#NT-TokenizedType">TokenizedType</a> | 727 | names of the element types and attributes, except that names beginning with |
728 | | <a href="#NT-EnumeratedType">EnumeratedType</a></code></td></tr><tr valign="baseline"><td><a name="NT-StringType" id="NT-StringType" />[55] </td><td><code>StringType</code></td><td> ::= </td><td><code>'CDATA'</code></td></tr><tr valign="baseline"><td><a name="NT-TokenizedType" id="NT-TokenizedType" />[56] </td><td><code>TokenizedType</code></td><td> ::= </td><td><code>'ID'</code></td><td><a href="#id">[VC: ID]</a></td></tr><tr valign="baseline"><td /><td /><td /><td /><td><a href="#one-id-per-el">[VC: One ID per Element Type]</a></td></tr><tr valign="baseline"><td /><td /><td /><td /><td><a href="#id-default">[VC: ID Attribute Default]</a></td></tr><tr valign="baseline"><td /><td /><td /><td><code>| 'IDREF'</code></td><td><a href="#idref">[VC: IDREF]</a></td></tr><tr valign="baseline"><td /><td /><td /><td><code>| 'IDREFS'</code></td><td><a href="#idref">[VC: IDREF]</a></td></tr><tr valign="baseline"><td /><td /><td /><td><code>| 'ENTITY'</code></td><td><a href="#entname">[VC: Entity Name]</a></td></tr><tr valign="baseline"><td /><td /><td /><td><code>| 'ENTITIES'</code></td><td><a href="#entname">[VC: Entity Name]</a></td></tr><tr valign="baseline"><td /><td /><td /><td><code>| 'NMTOKEN'</code></td><td><a href="#nmtok">[VC: Name Token]</a></td></tr><tr valign="baseline"><td /><td /><td /><td><code>| 'NMTOKENS'</code></td><td><a href="#nmtok">[VC: Name Token]</a></td></tr></tbody></table><div class="constraint"><p class="prefix"><a name="id" id="id" /><b>Validity constraint: ID</b></p><p>Values of type <b>ID</b> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match the <a href="#NT-Name">Name</a> production. A name <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> appear more than once | 728 | a match to <code>(('X'|'x')('M'|'m')('L'|'l'))</code> are reserved for standardization |
729 | in an XML document as a value of this type; i.e., ID values <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> uniquely | 729 | in this or future versions of this specification.</p><div class="constraint"><p class="prefix"><a name="GIMatch" id="GIMatch"/><b>Well-formedness constraint: Element Type Match</b></p><p>The <a href="#NT-Name">Name</a> |
730 | identify the elements which bear them.</p></div><div class="constraint"><p class="prefix"><a name="one-id-per-el" id="one-id-per-el" /><b>Validity constraint: One ID per Element Type</b></p><p><span class="mustard">An element | 730 | in an element's end-tag <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match the element type in the start-tag.</p></div><div class="constraint"><p class="prefix"><a name="elementvalid" id="elementvalid"/><b>Validity constraint: Element Valid</b></p><p>An element is valid |
731 | type <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em></span> have more than one ID attribute specified.</p></div><div class="constraint"><p class="prefix"><a name="id-default" id="id-default" /><b>Validity constraint: ID Attribute Default</b></p><p>An ID attribute | 731 | if there is a declaration matching <a href="#NT-elementdecl">elementdecl</a> |
732 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> have a declared default of <b>#IMPLIED</b> or <b>#REQUIRED</b>.</p></div><div class="constraint"><p class="prefix"><a name="idref" id="idref" /><b>Validity constraint: IDREF</b></p><p>Values of type <b>IDREF</b> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> | 732 | where the <a href="#NT-Name">Name</a> matches the element type, and one of |
733 | match the <a href="#NT-Name">Name</a> production, and values of type <b>IDREFS</b> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match <a href="#NT-Names">Names</a>; each <a href="#NT-Name">Name</a> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match the value of an ID attribute on some element in the XML document; | 733 | the following holds:</p><ol class="enumar"><li><p>The declaration matches <b>EMPTY</b> and the element has no <a title="Content" href="#dt-content">content</a> (not even entity |
734 | i.e. <b>IDREF</b> values <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match the value of some ID attribute.</p></div><div class="constraint"><p class="prefix"><a name="entname" id="entname" /><b>Validity constraint: Entity Name</b></p><p>Values of type <b>ENTITY</b> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match the <a href="#NT-Name">Name</a> production, values of type <b>ENTITIES</b> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match <a href="#NT-Names">Names</a>; each <a href="#NT-Name">Name</a> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match the name of an <a title="Unparsed Entity" href="#dt-unparsed">unparsed entity</a> | 734 | references, comments, PIs or white space).</p></li><li><p>The declaration matches <a href="#NT-children">children</a> and the |
735 | declared in the <a title="Document Type Declaration" href="#dt-doctype">DTD</a>.</p></div><div class="constraint"><p class="prefix"><a name="nmtok" id="nmtok" /><b>Validity constraint: Name Token</b></p><p>Values of type <b>NMTOKEN</b> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match the <a href="#NT-Nmtoken">Nmtoken</a> production; values of type <b>NMTOKENS</b> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match <a href="#NT-Nmtokens">Nmtokens</a>.</p></div><p>[<a name="dt-enumerated" id="dt-enumerated" title="Enumerated Attribute
Values">Definition</a>: <b>Enumerated attributes</b> <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em></span> take one of a list of values | 735 | sequence of <a title="Parent/Child" href="#dt-parentchild">child elements</a> belongs |
736 | provided in the declaration]. There are two kinds of enumerated types:</p> <h5><a name="IDAHXCU" id="IDAHXCU" />Enumerated Attribute Types</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-EnumeratedType" id="NT-EnumeratedType" />[57] </td><td><code>EnumeratedType</code></td><td> ::= </td><td><code><a href="#NT-NotationType">NotationType</a> | 736 | to the language generated by the regular expression in the content model, |
737 | | <a href="#NT-Enumeration">Enumeration</a></code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-NotationType" id="NT-NotationType" />[58] </td><td><code>NotationType</code></td><td> ::= </td><td><code>'NOTATION' <a href="#NT-S">S</a> '(' <a href="#NT-S">S</a>? <a href="#NT-Name">Name</a> (<a href="#NT-S">S</a>? '|' <a href="#NT-S">S</a>? <a href="#NT-Name">Name</a>)* <a href="#NT-S">S</a>? ')' </code></td><td><a href="#notatn">[VC: Notation Attributes]</a></td></tr><tr valign="baseline"><td /><td /><td /><td /><td><a href="#OneNotationPer">[VC: One Notation Per Element Type]</a></td></tr><tr valign="baseline"><td /><td /><td /><td /><td><a href="#NoNotationEmpty">[VC: No Notation on Empty Element]</a></td></tr><tr valign="baseline"><td /><td /><td /><td /><td><a href="#NoDuplicateTokens">[VC: No Duplicate | 737 | with optional white space, comments and |
738 | Tokens]</a></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-Enumeration" id="NT-Enumeration" />[59] </td><td><code>Enumeration</code></td><td> ::= </td><td><code>'(' <a href="#NT-S">S</a>? <a href="#NT-Nmtoken">Nmtoken</a> | 738 | PIs (i.e. markup matching production [27] <a href="#NT-Misc">Misc</a>) between the |
739 | (<a href="#NT-S">S</a>? '|' <a href="#NT-S">S</a>? <a href="#NT-Nmtoken">Nmtoken</a>)* <a href="#NT-S">S</a>? ')'</code></td><td><a href="#enum">[VC: Enumeration]</a></td></tr><tr valign="baseline"><td /><td /><td /><td /><td><a href="#NoDuplicateTokens">[VC: No Duplicate | 739 | start-tag and the first child element, between child elements, or between |
740 | Tokens]</a></td></tr></tbody></table><p>A <b>NOTATION</b> attribute identifies a <a title="Notation" href="#dt-notation">notation</a>, | 740 | the last child element and the end-tag. Note that a CDATA section containing |
741 | declared in the DTD with associated system and/or public identifiers, to be | 741 | only white space or a reference |
742 | used in interpreting the element to which the attribute is attached.</p><div class="constraint"><p class="prefix"><a name="notatn" id="notatn" /><b>Validity constraint: Notation Attributes</b></p><p>Values of this type | 742 | to an entity whose replacement text is character references expanding to white |
743 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match one of the <a href="#Notations"><cite>notation</cite></a> names | 743 | space do not |
744 | included in the declaration; all notation names in the declaration <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be | 744 | match the nonterminal <a href="#NT-S">S</a>, and |
745 | declared.</p></div><div class="constraint"><p class="prefix"><a name="OneNotationPer" id="OneNotationPer" /><b>Validity constraint: One Notation Per Element Type</b></p><p><span class="mustard">An element type <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em></span> have more than one <b>NOTATION</b> | 745 | hence cannot appear in these positions; however, a |
746 | attribute specified.</p></div><div class="constraint"><p class="prefix"><a name="NoNotationEmpty" id="NoNotationEmpty" /><b>Validity constraint: No Notation on Empty Element</b></p><p><a title="For Compatibility" href="#dt-compat">For compatibility</a>, | 746 | reference to an internal entity with a literal value consisting of character |
747 | an attribute of type <b>NOTATION</b> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> be declared on an element | 747 | references expanding to white space does match <a href="#NT-S">S</a>, since its |
748 | declared <b>EMPTY</b>.</p></div><div class="constraint"><p class="prefix"><a name="NoDuplicateTokens" id="NoDuplicateTokens" /><b>Validity constraint: No Duplicate | 748 | replacement text is the white space resulting from expansion of the character |
749 | Tokens</b></p><p>The notation names in a single <a href="#NT-NotationType">NotationType</a> | 749 | references.</p></li><li><p>The declaration matches <a href="#NT-Mixed">Mixed</a> |
750 | attribute declaration, as well as the <a href="#NT-Nmtoken">Nmtoken</a>s in a single | 750 | <span> |
751 | <a href="#NT-Enumeration">Enumeration</a> attribute declaration, <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> all be distinct.</p></div><div class="constraint"><p class="prefix"><a name="enum" id="enum" /><b>Validity constraint: Enumeration</b></p><p>Values of this type <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match | 751 | ,</span> and the content |
752 | one of the <a href="#NT-Nmtoken">Nmtoken</a> tokens in the declaration.</p></div><p><a title="For interoperability" href="#dt-interop">For interoperability,</a> the same <a href="#NT-Nmtoken">Nmtoken</a> <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD NOT</em> occur more than once in the enumerated | 752 | (after replacing |
753 | attribute types of a single element type.</p></div><div class="div3"> <h4><a name="sec-attr-defaults" id="sec-attr-defaults" />3.3.2 Attribute Defaults</h4><p>An <a title="Attribute-List Declaration" href="#dt-attdecl">attribute declaration</a> provides information | 753 | any entity references with their replacement text) consists of |
754 | on whether the attribute's presence is <em class="rfc2119" title="Keyword in RFC 2119 context">REQUIRED</em>, and if not, how an XML processor | 754 | <a title="Character Data" href="#dt-chardata">character data</a> |
755 | <span>is | 755 | <span> |
756 | to</span> react if a declared attribute is absent in a document.</p> <h5><a name="IDAR4CU" id="IDAR4CU" />Attribute Defaults</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-DefaultDecl" id="NT-DefaultDecl" />[60] </td><td><code>DefaultDecl</code></td><td> ::= </td><td><code>'#REQUIRED' | '#IMPLIED' </code></td></tr><tr valign="baseline"><td /><td /><td /><td><code>| (('#FIXED' S)? <a href="#NT-AttValue">AttValue</a>)</code></td><td><a href="#RequiredAttr">[VC: Required Attribute]</a></td></tr><tr valign="baseline"><td /><td /><td /><td /><td><a href="#defattrvalid">[VC: Attribute | 756 | (including <a title="CDATA Section" href="#dt-cdsection">CDATA sections</a>)</span>, |
757 | Default Value Syntactically Correct]</a></td></tr><tr valign="baseline"><td /><td /><td /><td /><td><a href="#CleanAttrVals">[WFC: No < in Attribute Values]</a></td></tr><tr valign="baseline"><td /><td /><td /><td /><td><a href="#FixedAttr">[VC: Fixed Attribute Default]</a></td></tr></tbody></table><p>In an attribute declaration, <b>#REQUIRED</b> means that the attribute | 757 | <a title="Comment" href="#dt-comment">comments</a>, <a title="Processing instruction" href="#dt-pi">PIs</a> and <a title="Parent/Child" href="#dt-parentchild">child elements</a> whose types match names in the |
758 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> always be provided, <b>#IMPLIED</b> that no default value is provided. | 758 | content model.</p></li><li><p>The declaration matches <b>ANY</b>, and the |
759 | [<a name="dt-default" id="dt-default" title="Attribute Default">Definition</a>: If | 759 | content (after replacing |
760 | the declaration is neither <b>#REQUIRED</b> nor <b>#IMPLIED</b>, then | 760 | any entity references with their replacement text) |
761 | the <a href="#NT-AttValue">AttValue</a> value contains the declared <b>default</b> | 761 | consists of character data<span> |
762 | value; the <b>#FIXED</b> keyword states that the attribute <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> always have | 762 | , <a title="CDATA Section" href="#dt-cdsection">CDATA |
763 | the default value. | 763 | sections</a>, <a title="Comment" href="#dt-comment">comments</a>, <a title="Processing instruction" href="#dt-pi">PIs</a> |
764 | When an XML processor encounters | 764 | </span> and <a title="Parent/Child" href="#dt-parentchild">child elements</a> |
765 | an <span>element | 765 | whose types have been declared.</p></li></ol></div><div class="div2"> <h3><a name="sec-starttags" id="sec-starttags"/>3.1 Start-Tags, End-Tags, and Empty-Element Tags</h3><p> |
766 | without a specification for an attribute for which it has read a default | 766 | [<a name="dt-stag" id="dt-stag" title="Start-Tag">Definition</a>: The beginning of every non-empty |
767 | value declaration, it <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> report the attribute with the declared default | 767 | XML element is marked by a <b>start-tag</b>.] |
768 | value to the application</span>.]</p><div class="constraint"><p class="prefix"><a name="RequiredAttr" id="RequiredAttr" /><b>Validity constraint: Required Attribute</b></p><p>If the default | 768 | </p> <h5><a name="IDA10FS" id="IDA10FS"/>Start-tag</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-STag" id="NT-STag"/>[40] </td><td><code>STag</code></td><td> ::= </td><td><code>'<' <a href="#NT-Name">Name</a> (<a href="#NT-S">S</a> |
769 | declaration is the keyword <b>#REQUIRED</b>, then the attribute <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be | 769 | <a href="#NT-Attribute">Attribute</a>)* <a href="#NT-S">S</a>? '>'</code></td><td><a href="#uniqattspec">[WFC: Unique Att Spec]</a></td></tr><tr valign="baseline"><td><a name="NT-Attribute" id="NT-Attribute"/>[41] </td><td><code>Attribute</code></td><td> ::= </td><td><code> |
770 | specified for all elements of the type in the attribute-list declaration.</p></div><div class="constraint"><p class="prefix"><a name="defattrvalid" id="defattrvalid" /><b>Validity constraint: <span>Attribute | 770 | <a href="#NT-Name">Name</a> |
771 | Default Value Syntactically Correct</span></b></p><p>The declared default value <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> meet the <span>syntactic</span> | 771 | <a href="#NT-Eq">Eq</a> |
772 | constraints of the declared attribute type.</p><p>Note that only the | 772 | <a href="#NT-AttValue">AttValue</a> |
773 | syntactic constraints of the type are required here; other constraints (e.g. | 773 | </code></td><td><a href="#ValueType">[VC: Attribute Value Type]</a></td></tr><tr valign="baseline"><td/><td/><td/><td/><td><a href="#NoExternalRefs">[WFC: No External Entity References]</a></td></tr><tr valign="baseline"><td/><td/><td/><td/><td><a href="#CleanAttrVals">[WFC: No < in Attribute Values]</a></td></tr></tbody></table><p>The <a href="#NT-Name">Name</a> in the start- and end-tags gives the element's <b>type</b>. [<a name="dt-attr" id="dt-attr" title="Attribute">Definition</a>: The <a href="#NT-Name">Name</a>-<a href="#NT-AttValue">AttValue</a> |
774 | that the value be the name of a declared unparsed entity, for an attribute of | 774 | pairs are referred to as the <b>attribute specifications</b> of the |
775 | type ENTITY) may come into play if the declared default value is actually used | 775 | element], [<a name="dt-attrname" id="dt-attrname" title="Attribute Name">Definition</a>: with the <a href="#NT-Name">Name</a> in each pair referred to as the <b>attribute name</b> |
776 | (an element without a specification for this attribute occurs).</p></div><div class="constraint"><p class="prefix"><a name="FixedAttr" id="FixedAttr" /><b>Validity constraint: Fixed Attribute Default</b></p><p>If an attribute | 776 | ] |
777 | has a default value declared with the <b>#FIXED</b> keyword, instances of | 777 | and [<a name="dt-attrval" id="dt-attrval" title="Attribute Value">Definition</a>: the content of the <a href="#NT-AttValue">AttValue</a> (the text between the <code>'</code> or <code>"</code> |
778 | that attribute <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match the default value.</p></div><p>Examples of attribute-list declarations:</p><div class="exampleInner"><pre><!ATTLIST termdef | 778 | delimiters) as the <b>attribute value</b>.] Note |
779 | id ID #REQUIRED | 779 | that the order of attribute specifications in a start-tag or empty-element |
780 | name CDATA #IMPLIED> | 780 | tag is not significant.</p><div class="constraint"><p class="prefix"><a name="uniqattspec" id="uniqattspec"/><b>Well-formedness constraint: Unique Att Spec</b></p><p>An attribute name |
781 | <!ATTLIST list | 781 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> appear more than once in the same start-tag or empty-element tag.</p></div><div class="constraint"><p class="prefix"><a name="ValueType" id="ValueType"/><b>Validity constraint: Attribute Value Type</b></p><p>The attribute <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> |
782 | type (bullets|ordered|glossary) "ordered"> | 782 | have been declared; the value <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be of the type declared for it. (For attribute |
783 | <!ATTLIST form | 783 | types, see <a href="#attdecls"><b>3.3 Attribute-List Declarations</b></a>.)</p></div><div class="constraint"><p class="prefix"><a name="NoExternalRefs" id="NoExternalRefs"/><b>Well-formedness constraint: No External Entity References</b></p><p>Attribute |
784 | method CDATA #FIXED "POST"></pre></div></div><div class="div3"> <h4><a name="AVNormalize" id="AVNormalize" />3.3.3 Attribute-Value Normalization</h4><p>Before the value of an attribute is passed to the application or checked | 784 | values <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> contain direct or indirect entity references to external entities.</p></div><div class="constraint"><p class="prefix"><a name="CleanAttrVals" id="CleanAttrVals"/><b>Well-formedness constraint: No <code><</code> in Attribute Values</b></p><p>The <a title="Replacement Text" href="#dt-repltext">replacement text</a> of any entity |
785 | for validity, the XML processor <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> normalize the attribute value by applying | 785 | referred to directly or indirectly in an attribute value <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> contain a <code><</code>.</p></div><p>An example of a start-tag:</p><div class="exampleInner"><pre><termdef id="dt-dog" term="dog"></pre></div><p> |
786 | the algorithm below, or by using some other method such that the value passed | 786 | [<a name="dt-etag" id="dt-etag" title="End Tag">Definition</a>: The end of every element that begins |
787 | to the application is the same as that produced by the algorithm.</p><ol type="1"><li><p>All line breaks <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> have been normalized on input to #xA as described | 787 | with a start-tag <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be marked by an <b>end-tag</b> containing a name |
788 | in <a href="#sec-line-ends"><b>2.11 End-of-Line Handling</b></a>, so the rest of this algorithm operates | 788 | that echoes the element's type as given in the start-tag:] |
789 | on text normalized in this way.</p></li><li><p>Begin with a normalized value consisting of the empty string.</p></li><li><p>For each character, entity reference, or character reference in the | 789 | </p> <h5><a name="IDAZIIS" id="IDAZIIS"/>End-tag</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-ETag" id="NT-ETag"/>[42] </td><td><code>ETag</code></td><td> ::= </td><td><code>'</' <a href="#NT-Name">Name</a> |
790 | unnormalized attribute value, beginning with the first and continuing to the | 790 | <a href="#NT-S">S</a>? |
791 | last, do the following:</p><ul><li><p>For a character reference, append the referenced character to the | 791 | '>'</code></td></tr></tbody></table><p>An example of an end-tag:</p><div class="exampleInner"><pre></termdef></pre></div><p> |
792 | normalized value.</p></li><li><p>For an entity reference, recursively apply step 3 of this algorithm | 792 | [<a name="dt-content" id="dt-content" title="Content">Definition</a>: The <a title="Text" href="#dt-text">text</a> |
793 | to the replacement text of the entity.</p></li><li><p>For a white space character (#x20, #xD, #xA, #x9), append a space | 793 | between the start-tag and end-tag is called the element's <b>content</b>:] |
794 | character (#x20) to the normalized value.</p></li><li><p>For another character, append the character to the normalized value.</p></li></ul></li></ol><p>If the attribute type is not CDATA, then the XML processor <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> further | 794 | </p> <h5><a name="IDAHKIS" id="IDAHKIS"/>Content of Elements</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-content" id="NT-content"/>[43] </td><td><code>content</code></td><td> ::= </td><td><code> |
795 | process the normalized attribute value by discarding any leading and trailing | 795 | <a href="#NT-CharData">CharData</a>? ((<a href="#NT-element">element</a> |
796 | space (#x20) characters, and by replacing sequences of space (#x20) characters | 796 | | <a href="#NT-Reference">Reference</a> | <a href="#NT-CDSect">CDSect</a> |
797 | by a single space (#x20) character.</p><p>Note that if the unnormalized attribute value contains a character reference | 797 | | <a href="#NT-PI">PI</a> | <a href="#NT-Comment">Comment</a>) <a href="#NT-CharData">CharData</a>?)*</code></td></tr></tbody></table><p> |
798 | to a white space character other than space (#x20), the normalized value contains | 798 | [<a name="dt-empty" id="dt-empty" title="Empty">Definition</a>: An element |
799 | the referenced character itself (#xD, #xA or #x9). This contrasts with the | 799 | with no <a href="#NT-content">content</a> is said to be <b>empty</b>.] The representation |
800 | case where the unnormalized value contains a white space character (not a | 800 | of an empty element is either a start-tag immediately followed by an end-tag, |
801 | reference), which is replaced with a space character (#x20) in the normalized | 801 | or an empty-element tag. [<a name="dt-eetag" id="dt-eetag" title="empty-element tag">Definition</a>: An <b>empty-element |
802 | value and also contrasts with the case where the unnormalized value contains | 802 | tag</b> takes a special form:] |
803 | an entity reference whose replacement text contains a white space character; | 803 | </p> <h5><a name="IDAMMIS" id="IDAMMIS"/>Tags for Empty Elements</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-EmptyElemTag" id="NT-EmptyElemTag"/>[44] </td><td><code>EmptyElemTag</code></td><td> ::= </td><td><code>'<' <a href="#NT-Name">Name</a> (<a href="#NT-S">S</a> |
804 | being recursively processed, the white space character is replaced with a | 804 | <a href="#NT-Attribute">Attribute</a>)* <a href="#NT-S">S</a>? '/>'</code></td><td><a href="#uniqattspec">[WFC: Unique Att Spec]</a></td></tr></tbody></table><p>Empty-element tags <span>may</span> be used for any element which has no content, whether |
805 | space character (#x20) in the normalized value.</p><p>All attributes for which no declaration has been read <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> be treated | 805 | or not it is declared using the keyword <b>EMPTY</b>. <a title="For interoperability" href="#dt-interop">For |
806 | by a non-validating processor as if declared <b>CDATA</b>.</p><p>It | 806 | interoperability</a>, the empty-element tag <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> |
807 | is an error if an | 807 | be used, and <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> only be used, for elements which are declared |
808 | <span><a title="Attribute Value" href="#dt-attrval">attribute | 808 | EMPTY.</p><p>Examples of empty elements:</p><div class="exampleInner"><pre><IMG align="left" |
809 | value</a> contains a <a title="Entity Reference" href="#dt-entref">reference</a> to an | 809 | src="http://www.w3.org/Icons/WWW/w3c_home" /> |
810 | entity for which no declaration has been read.</span></p><p>Following are examples of attribute normalization. Given the following | 810 | <br></br> |
811 | declarations:</p><div class="exampleInner"><pre><!ENTITY d "&#xD;"> | 811 | <br/></pre></div></div><div class="div2"> <h3><a name="elemdecls" id="elemdecls"/>3.2 Element Type Declarations</h3><p>The <a title="Element" href="#dt-element">element</a> structure of an <a title="XML Document" href="#dt-xml-doc">XML document</a> <span>may</span>, for <a title="Validity" href="#dt-valid">validation</a> |
812 | <!ENTITY a "&#xA;"> | 812 | purposes, be constrained using element type and attribute-list declarations. |
813 | <!ENTITY da "&#xD;&#xA;"></pre></div><p>the attribute specifications in the left column below would be normalized | 813 | An element type declaration constrains the element's <a title="Content" href="#dt-content">content</a>.</p><p>Element type declarations often constrain which element types can appear |
814 | to the character sequences of the middle column if the attribute <code>a</code> | 814 | as <a title="Parent/Child" href="#dt-parentchild">children</a> of the element. At user |
815 | is declared <b>NMTOKENS</b> and to those of the right column if <code>a</code> | 815 | option, an XML processor <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> issue a warning when a declaration mentions an |
816 | is declared <b>CDATA</b>.</p><table border="1" frame="border" summary="Attribute normalization summary"><thead><tr><th rowspan="1" colspan="1">Attribute specification</th><th rowspan="1" colspan="1">a is NMTOKENS</th><th rowspan="1" colspan="1">a is CDATA</th></tr></thead><tbody><tr><td rowspan="1" colspan="1"><div class="exampleInner"><pre>a=" | 816 | element type for which no declaration is provided, but this is not an error.</p><p> |
817 | xyz"</pre></div></td><td rowspan="1" colspan="1"><div class="exampleInner"><pre>x y z</pre></div></td><td rowspan="1" colspan="1"><div class="exampleInner"><pre>#x20 #x20 x y z</pre></div></td></tr><tr><td rowspan="1" colspan="1"><div class="exampleInner"><pre>a="&d;&d;A&a;<span>&#x20;</span>&a;B&da;"</pre></div></td><td rowspan="1" colspan="1"><div class="exampleInner"><pre>A #x20 B</pre></div></td><td rowspan="1" colspan="1"><div class="exampleInner"><pre>#x20 #x20 A #x20 <span>#x20</span> #x20 B #x20 #x20</pre></div></td></tr><tr><td rowspan="1" colspan="1"><div class="exampleInner"><pre>a= | 817 | [<a name="dt-eldecl" id="dt-eldecl" title="Element Type declaration">Definition</a>: An <b>element |
818 | "&#xd;&#xd;A&#xa;&#xa;B&#xd;&#xa;"</pre></div></td><td rowspan="1" colspan="1"><div class="exampleInner"><pre>#xD #xD A #xA #xA B #xD #xA</pre></div></td><td rowspan="1" colspan="1"><div class="exampleInner"><pre>#xD #xD A #xA #xA B #xD #xA</pre></div></td></tr></tbody></table><p>Note that the last example is invalid (but well-formed) if <code>a</code> | 818 | type declaration</b> takes the form:] |
819 | is declared to be of type <b>NMTOKENS</b>.</p></div></div><div class="div2"> <h3><a name="sec-condition-sect" id="sec-condition-sect" />3.4 Conditional Sections</h3><p>[<a name="dt-cond-section" id="dt-cond-section" title="conditional section">Definition</a>: <b>Conditional | 819 | </p> <h5><a name="IDAYPIS" id="IDAYPIS"/>Element Type Declaration</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-elementdecl" id="NT-elementdecl"/>[45] </td><td><code>elementdecl</code></td><td> ::= </td><td><code>'<!ELEMENT' <a href="#NT-S">S</a> |
820 | sections</b> are portions of the <a title="Document Type Declaration" href="#dt-doctype">document type | 820 | <a href="#NT-Name">Name</a> |
821 | declaration external subset</a> <span>or | 821 | <a href="#NT-S">S</a> |
822 | of external parameter entities </span>which are included in, or excluded from, | 822 | <a href="#NT-contentspec">contentspec</a> |
823 | the logical structure of the DTD based on the keyword which governs them.]</p> <h5><a name="IDAMHDU" id="IDAMHDU" />Conditional Section</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-conditionalSect" id="NT-conditionalSect" />[61] </td><td><code>conditionalSect</code></td><td> ::= </td><td><code><a href="#NT-includeSect">includeSect</a> | <a href="#NT-ignoreSect">ignoreSect</a></code></td></tr><tr valign="baseline"><td><a name="NT-includeSect" id="NT-includeSect" />[62] </td><td><code>includeSect</code></td><td> ::= </td><td><code>'<![' S? 'INCLUDE' S? '[' <a href="#NT-extSubsetDecl">extSubsetDecl</a> | 823 | <a href="#NT-S">S</a>? |
824 | ']]>' </code></td><td><a href="#condsec-nesting">[VC: Proper Conditional Section/PE Nesting]</a></td></tr><tr valign="baseline"><td><a name="NT-ignoreSect" id="NT-ignoreSect" />[63] </td><td><code>ignoreSect</code></td><td> ::= </td><td><code>'<![' S? 'IGNORE' S? '[' <a href="#NT-ignoreSectContents">ignoreSectContents</a>* | 824 | '>'</code></td><td><a href="#EDUnique">[VC: Unique Element Type Declaration]</a></td></tr><tr valign="baseline"><td><a name="NT-contentspec" id="NT-contentspec"/>[46] </td><td><code>contentspec</code></td><td> ::= </td><td><code>'EMPTY' | 'ANY' | <a href="#NT-Mixed">Mixed</a> |
825 | ']]>'</code></td><td><a href="#condsec-nesting">[VC: Proper Conditional Section/PE Nesting]</a></td></tr><tr valign="baseline"><td><a name="NT-ignoreSectContents" id="NT-ignoreSectContents" />[64] </td><td><code>ignoreSectContents</code></td><td> ::= </td><td><code><a href="#NT-Ignore">Ignore</a> ('<![' <a href="#NT-ignoreSectContents">ignoreSectContents</a> ']]>' <a href="#NT-Ignore">Ignore</a>)*</code></td></tr><tr valign="baseline"><td><a name="NT-Ignore" id="NT-Ignore" />[65] </td><td><code>Ignore</code></td><td> ::= </td><td><code><a href="#NT-Char">Char</a>* - (<a href="#NT-Char">Char</a>* | 825 | | <a href="#NT-children">children</a> |
826 | ('<![' | ']]>') <a href="#NT-Char">Char</a>*) </code></td></tr></tbody></table><div class="constraint"><p class="prefix"><a name="condsec-nesting" id="condsec-nesting" /><b>Validity constraint: Proper Conditional Section/PE Nesting</b></p><p>If any of the "<code><![</code>", | 826 | </code></td></tr></tbody></table><p>where the <a href="#NT-Name">Name</a> gives the element type being declared.</p><div class="constraint"><p class="prefix"><a name="EDUnique" id="EDUnique"/><b>Validity constraint: Unique Element Type Declaration</b></p><p>An element type <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> be declared more than once.</p></div><p>Examples of element type declarations:</p><div class="exampleInner"><pre><!ELEMENT br EMPTY> |
827 | "<code>[</code>", or "<code>]]></code>" of a conditional section is contained | 827 | <!ELEMENT p (#PCDATA|emph)* > |
828 | in the replacement text for a parameter-entity reference, all of them <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> | 828 | <!ELEMENT %name.para; %content.para; > |
829 | be contained in the same replacement text.</p></div><p>Like the internal and external DTD subsets, a conditional section may contain | 829 | <!ELEMENT container ANY></pre></div><div class="div3"> <h4><a name="sec-element-content" id="sec-element-content"/>3.2.1 Element Content</h4><p> |
830 | one or more complete declarations, comments, processing instructions, or nested | 830 | [<a name="dt-elemcontent" id="dt-elemcontent" title="Element content">Definition</a>: An element <a title="Start-Tag" href="#dt-stag">type</a> has <b>element content</b> when elements |
831 | conditional sections, intermingled with white space.</p><p>If the keyword of the conditional section is <b>INCLUDE</b>, then the | 831 | of that type <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> contain only <a title="Parent/Child" href="#dt-parentchild">child</a> |
832 | contents of the conditional section <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be considered</span> part of the DTD. If the keyword of | 832 | elements (no character data), optionally separated by white space (characters |
833 | the conditional section is <b>IGNORE</b>, then the contents of the conditional | 833 | matching the nonterminal <a href="#NT-S">S</a>).] |
834 | section <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be considered as</span> not logically part of the DTD. | 834 | [<a name="dt-content-model" id="dt-content-model" title="Content model">Definition</a>: In this case, the constraint includes a <b>content |
835 | If a conditional section with a keyword of <b>INCLUDE</b> occurs within | 835 | model</b>, a simple grammar governing the allowed types of the |
836 | a larger conditional section with a keyword of <b>IGNORE</b>, both the outer | 836 | child elements and the order in which they are allowed to appear.] |
837 | and the inner conditional sections <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be</span> ignored. The contents | 837 | The grammar is built on content particles (<a href="#NT-cp">cp</a>s), which |
838 | of an ignored conditional section <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be</span> parsed by ignoring all characters after | 838 | consist of names, choice lists of content particles, or sequence lists of |
839 | the "<code>[</code>" following the keyword, except conditional section starts | 839 | content particles:</p> <h5><a name="IDATTIS" id="IDATTIS"/>Element-content Models</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-children" id="NT-children"/>[47] </td><td><code>children</code></td><td> ::= </td><td><code>(<a href="#NT-choice">choice</a> | <a href="#NT-seq">seq</a>) |
840 | "<code><![</code>" and ends "<code>]]></code>", until the matching conditional | 840 | ('?' | '*' | '+')?</code></td></tr><tr valign="baseline"><td><a name="NT-cp" id="NT-cp"/>[48] </td><td><code>cp</code></td><td> ::= </td><td><code>(<a href="#NT-Name">Name</a> | <a href="#NT-choice">choice</a> |
841 | section end is found. Parameter entity references <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> be</span> recognized in this | 841 | | <a href="#NT-seq">seq</a>) ('?' | '*' | '+')?</code></td></tr><tr valign="baseline"><td><a name="NT-choice" id="NT-choice"/>[49] </td><td><code>choice</code></td><td> ::= </td><td><code>'(' <a href="#NT-S">S</a>? <a href="#NT-cp">cp</a> ( <a href="#NT-S">S</a>? '|' <a href="#NT-S">S</a>? <a href="#NT-cp">cp</a> )+ <a href="#NT-S">S</a>? ')'</code></td><td><a href="#vc-PEinGroup">[VC: Proper Group/PE Nesting]</a></td></tr><tr valign="baseline"><td><a name="NT-seq" id="NT-seq"/>[50] </td><td><code>seq</code></td><td> ::= </td><td><code>'(' <a href="#NT-S">S</a>? <a href="#NT-cp">cp</a> ( <a href="#NT-S">S</a>? ',' <a href="#NT-S">S</a>? <a href="#NT-cp">cp</a> )* <a href="#NT-S">S</a>? ')'</code></td><td><a href="#vc-PEinGroup">[VC: Proper Group/PE Nesting]</a></td></tr></tbody></table><p>where each <a href="#NT-Name">Name</a> is the type of an element which |
842 | process.</p><p>If the keyword of the conditional section is a parameter-entity reference, | 842 | <span>may</span> appear as a <a title="Parent/Child" href="#dt-parentchild">child</a>. Any content |
843 | the parameter entity <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be replaced by its content before the processor | 843 | particle in a choice list <span>may</span> appear in the <a title="Element content" href="#dt-elemcontent">element |
844 | decides whether to include or ignore the conditional section.</p><p>An example:</p><div class="exampleInner"><pre><!ENTITY % draft 'INCLUDE' > | 844 | content</a> at the location where the choice list appears in the grammar; |
845 | <!ENTITY % final 'IGNORE' > | 845 | content particles occurring in a sequence list <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> each appear in the <a title="Element content" href="#dt-elemcontent">element content</a> in the order given in the list. |
846 | <![%draft;[ | 846 | The optional character following a name or list governs whether the element |
847 | <!ELEMENT book (comments*, title, body, supplements?)> | 847 | or the content particles in the list may occur one or more (<code>+</code>), |
848 | ]]> | 848 | zero or more (<code>*</code>), or zero or one times (<code>?</code>). The |
849 | <![%final;[ | 849 | absence of such an operator means that the element or content particle <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> |
850 | <!ELEMENT book (title, body, supplements?)> | 850 | appear exactly once. This syntax and meaning are identical to those used in |
851 | ]]></pre></div></div></div><div class="div1"> <h2><a name="sec-physical-struct" id="sec-physical-struct" />4 Physical Structures</h2><p>[<a name="dt-entity" id="dt-entity" title="Entity">Definition</a>: An XML document may consist of one | 851 | the productions in this specification.</p><p>The content of an element matches a content model if and only if it is |
852 | or many storage units. These | 852 | possible to trace out a path through the content model, obeying the sequence, |
853 | are called <b>entities</b>; they all have <b>content</b> and are | 853 | choice, and repetition operators and matching each element in the content |
854 | all (except for the <a title="Document Entity" href="#dt-docent">document entity</a> and | 854 | against an element type in the content model. <a title="For Compatibility" href="#dt-compat">For |
855 | the <a title="Document Type Declaration" href="#dt-doctype">external DTD subset</a>) identified by | 855 | compatibility</a>, it is an error if the content model |
856 | entity <b>name</b>.] Each XML document has one entity | 856 | allows an element to match more than one occurrence of an element type in the |
857 | called the <a title="Document Entity" href="#dt-docent">document entity</a>, which serves | 857 | content model. For more information, see <a href="#determinism"><b>D Deterministic Content Models</b></a>.</p><div class="constraint"><p class="prefix"><a name="vc-PEinGroup" id="vc-PEinGroup"/><b>Validity constraint: Proper Group/PE Nesting</b></p><p>Parameter-entity <a title="Replacement Text" href="#dt-repltext">replacement text</a> |
858 | as the starting point for the <a title="XML Processor" href="#dt-xml-proc">XML processor</a> | 858 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be properly nested with parenthesized |
859 | and may contain the whole document.</p><p>Entities may be either parsed or unparsed. [<a name="dt-parsedent" id="dt-parsedent" title="Text Entity">Definition</a>: The contents of a <b>parsed | 859 | groups. That is to say, if either of the opening or closing parentheses in |
860 | entity</b> are referred to as its <a title="Replacement Text" href="#dt-repltext">replacement | 860 | a <a href="#NT-choice">choice</a>, <a href="#NT-seq">seq</a>, or <a href="#NT-Mixed">Mixed</a> |
861 | text</a>; this <a title="Text" href="#dt-text">text</a> is considered an | 861 | construct is contained in the replacement text for a <a title="Parameter-entity reference" href="#dt-PERef">parameter |
862 | integral part of the document.]</p><p>[<a name="dt-unparsed" id="dt-unparsed" title="Unparsed Entity">Definition</a>: An <b>unparsed entity</b> | 862 | entity</a>, both <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be contained in the same replacement text.</p><p> |
863 | is a resource whose contents may or may not be <a title="Text" href="#dt-text">text</a>, | 863 | <a title="For interoperability" href="#dt-interop">For interoperability</a>, if a parameter-entity reference |
864 | and if text, may | 864 | appears in a <a href="#NT-choice">choice</a>, <a href="#NT-seq">seq</a>, or <a href="#NT-Mixed">Mixed</a> construct, its replacement text <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> contain at |
865 | be other than XML. Each unparsed entity has an associated <a title="Notation" href="#dt-notation">notation</a>, identified by name. Beyond a requirement | 865 | least one non-blank character, and neither the first nor last non-blank character |
866 | that an XML processor make the identifiers for the entity and notation available | 866 | of the replacement text <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> be a connector (<code>|</code> or <code>,</code>).</p></div><p>Examples of element-content models:</p><div class="exampleInner"><pre><!ELEMENT spec (front, body, back?)> |
867 | to the application, XML places no constraints on the contents of unparsed | 867 | <!ELEMENT div1 (head, (p | list | note)*, div2*)> |
868 | entities.]</p><p>Parsed entities are invoked by name using entity references; unparsed entities | 868 | <!ELEMENT dictionary-body (%div.mix; | %dict.mix;)*></pre></div></div><div class="div3"> <h4><a name="sec-mixed-content" id="sec-mixed-content"/>3.2.2 Mixed Content</h4><p> |
869 | by name, given in the value of <b>ENTITY</b> or <b>ENTITIES</b> attributes.</p><p>[<a name="gen-entity" id="gen-entity" title="general entity">Definition</a>: <b>General entities</b> | 869 | [<a name="dt-mixed" id="dt-mixed" title="Mixed Content">Definition</a>: An element <a title="Start-Tag" href="#dt-stag">type</a> |
870 | are entities for use within the document content. In this specification, general | 870 | has <b>mixed content</b> when elements of that type <span>may</span> contain character |
871 | entities are sometimes referred to with the unqualified term <em>entity</em> | 871 | data, optionally interspersed with <a title="Parent/Child" href="#dt-parentchild">child</a> |
872 | when this leads to no ambiguity.] [<a name="dt-PE" id="dt-PE" title="Parameter entity">Definition</a>: <b>Parameter | 872 | elements.] In this case, the types of the child elements <span>may</span> be constrained, |
873 | entities</b> are parsed entities for use within the DTD.] | 873 | but not their order or their number of occurrences:</p> <h5><a name="IDAC2IS" id="IDAC2IS"/>Mixed-content Declaration</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-Mixed" id="NT-Mixed"/>[51] </td><td><code>Mixed</code></td><td> ::= </td><td><code>'(' <a href="#NT-S">S</a>? '#PCDATA' (<a href="#NT-S">S</a>? |
874 | These two types of entities use different forms of reference and are recognized | 874 | '|' <a href="#NT-S">S</a>? <a href="#NT-Name">Name</a>)* <a href="#NT-S">S</a>? |
875 | in different contexts. Furthermore, they occupy different namespaces; a parameter | 875 | ')*' </code></td></tr><tr valign="baseline"><td/><td/><td/><td><code>| '(' <a href="#NT-S">S</a>? '#PCDATA' <a href="#NT-S">S</a>? ')' </code></td><td><a href="#vc-PEinGroup">[VC: Proper Group/PE Nesting]</a></td></tr><tr valign="baseline"><td/><td/><td/><td/><td><a href="#vc-MixedChildrenUnique">[VC: No Duplicate Types]</a></td></tr></tbody></table><p>where the <a href="#NT-Name">Name</a>s give the types of elements that |
876 | entity and a general entity with the same name are two distinct entities.</p><div class="div2"> <h3><a name="sec-references" id="sec-references" />4.1 Character and Entity References</h3><p>[<a name="dt-charref" id="dt-charref" title="Character Reference">Definition</a>: A <b>character | 876 | may appear as children. The |
877 | reference</b> refers to a specific character in the ISO/IEC 10646 character | 877 | keyword <b>#PCDATA</b> derives historically from the term "parsed |
878 | set, for example one not directly accessible from available input devices.]</p> <h5><a name="IDAFYDU" id="IDAFYDU" />Character Reference</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-CharRef" id="NT-CharRef" />[66] </td><td><code>CharRef</code></td><td> ::= </td><td><code>'&#' [0-9]+ ';' </code></td></tr><tr valign="baseline"><td /><td /><td /><td><code>| '&#x' [0-9a-fA-F]+ ';'</code></td><td><a href="#wf-Legalchar">[WFC: Legal Character]</a></td></tr></tbody></table><div class="constraint"><p class="prefix"><a name="wf-Legalchar" id="wf-Legalchar" /><b>Well-formedness constraint: Legal Character</b></p><p>Characters referred | 878 | character data." |
879 | to using character references <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match the production for <a href="#NT-Char">Char</a>.</p></div><p>If the character reference begins with "<code>&#x</code>", | 879 | </p><div class="constraint"><p class="prefix"><a name="vc-MixedChildrenUnique" id="vc-MixedChildrenUnique"/><b>Validity constraint: No Duplicate Types</b></p><p>The |
880 | the digits and letters up to the terminating <code>;</code> provide a hexadecimal | 880 | same name <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> appear more than once in a single mixed-content declaration.</p></div><p>Examples of mixed content declarations:</p><div class="exampleInner"><pre><!ELEMENT p (#PCDATA|a|ul|b|i|em)*> |
881 | representation of the character's code point in ISO/IEC 10646. If it begins | 881 | <!ELEMENT p (#PCDATA | %font; | %phrase; | %special; | %form;)* > |
882 | just with "<code>&#</code>", the digits up to the terminating <code>;</code> | 882 | <!ELEMENT b (#PCDATA)></pre></div></div></div><div class="div2"> <h3><a name="attdecls" id="attdecls"/>3.3 Attribute-List Declarations</h3><p> |
883 | provide a decimal representation of the character's code point.</p><p>[<a name="dt-entref" id="dt-entref" title="Entity Reference">Definition</a>: An <b>entity reference</b> | 883 | <a title="Attribute" href="#dt-attr">Attributes</a> are used to associate name-value |
884 | refers to the content of a named entity.] [<a name="dt-GERef" id="dt-GERef" title="General Entity Reference">Definition</a>: References to parsed general entities use | 884 | pairs with <a title="Element" href="#dt-element">elements</a>. Attribute specifications |
885 | ampersand (<code>&</code>) and semicolon (<code>;</code>) as delimiters.] [<a name="dt-PERef" id="dt-PERef" title="Parameter-entity reference">Definition</a>: <b>Parameter-entity references</b> | 885 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> appear outside of <a title="Start-Tag" href="#dt-stag">start-tags</a> and <a title="empty-element tag" href="#dt-eetag">empty-element tags</a>; thus, the productions used to |
886 | use percent-sign (<code>%</code>) and semicolon (<code>;</code>) as delimiters.]</p> <h5><a name="IDAS0DU" id="IDAS0DU" />Entity Reference</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-Reference" id="NT-Reference" />[67] </td><td><code>Reference</code></td><td> ::= </td><td><code><a href="#NT-EntityRef">EntityRef</a> | <a href="#NT-CharRef">CharRef</a></code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-EntityRef" id="NT-EntityRef" />[68] </td><td><code>EntityRef</code></td><td> ::= </td><td><code>'&' <a href="#NT-Name">Name</a> ';'</code></td><td><a href="#wf-entdeclared">[WFC: Entity Declared]</a></td></tr><tr valign="baseline"><td /><td /><td /><td /><td><a href="#vc-entdeclared">[VC: Entity Declared]</a></td></tr><tr valign="baseline"><td /><td /><td /><td /><td><a href="#textent">[WFC: Parsed Entity]</a></td></tr><tr valign="baseline"><td /><td /><td /><td /><td><a href="#norecursion">[WFC: No Recursion]</a></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-PEReference" id="NT-PEReference" />[69] </td><td><code>PEReference</code></td><td> ::= </td><td><code>'%' <a href="#NT-Name">Name</a> ';'</code></td><td><a href="#vc-entdeclared">[VC: Entity Declared]</a></td></tr><tr valign="baseline"><td /><td /><td /><td /><td><a href="#norecursion">[WFC: No Recursion]</a></td></tr><tr valign="baseline"><td /><td /><td /><td /><td><a href="#indtd">[WFC: In DTD]</a></td></tr></tbody></table><div class="constraint"><p class="prefix"><a name="wf-entdeclared" id="wf-entdeclared" /><b>Well-formedness constraint: Entity Declared</b></p><p>In a document | 886 | recognize them appear in <a href="#sec-starttags"><b>3.1 Start-Tags, End-Tags, and Empty-Element Tags</b></a>. Attribute-list declarations |
887 | without any DTD, a document with only an internal DTD subset which contains | 887 | <span>may</span> be used:</p><ul><li><p>To define the set of attributes pertaining to a given element type.</p></li><li><p>To establish type constraints for these attributes.</p></li><li><p>To provide <a title="Attribute Default" href="#dt-default">default values</a> for |
888 | no parameter entity references, or a document with "<code>standalone='yes'</code>", for | 888 | attributes.</p></li></ul><p> |
889 | an entity reference that does not occur within the external subset or a parameter | 889 | [<a name="dt-attdecl" id="dt-attdecl" title="Attribute-List Declaration">Definition</a>: |
890 | entity, the <a href="#NT-Name">Name</a> given in the entity reference <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> <a title="match" href="#dt-match">match</a> that in an <a href="#sec-entity-decl"><cite>entity | 890 | <b>Attribute-list |
891 | declaration</cite></a> that does not occur within the external subset or a | 891 | declarations</b> specify the name, data type, and default value (if any) |
892 | parameter entity, except that well-formed documents need not declare | 892 | of each attribute associated with a given element type:] |
893 | any of the following entities: <code>amp</code>, | 893 | </p> <h5><a name="IDAUAJS" id="IDAUAJS"/>Attribute-list Declaration</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-AttlistDecl" id="NT-AttlistDecl"/>[52] </td><td><code>AttlistDecl</code></td><td> ::= </td><td><code>'<!ATTLIST' <a href="#NT-S">S</a> |
894 | <code>lt</code>, | 894 | <a href="#NT-Name">Name</a> |
895 | <code>gt</code>, | 895 | <a href="#NT-AttDef">AttDef</a>* <a href="#NT-S">S</a>? '>'</code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-AttDef" id="NT-AttDef"/>[53] </td><td><code>AttDef</code></td><td> ::= </td><td><code> |
896 | <code>apos</code>, | 896 | <a href="#NT-S">S</a> |
897 | <code>quot</code>. The | 897 | <a href="#NT-Name">Name</a> |
898 | declaration of a general entity <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> precede any reference to it which appears | 898 | <a href="#NT-S">S</a> |
899 | in a default value in an attribute-list declaration.</p><p><span>Note | 899 | <a href="#NT-AttType">AttType</a> |
900 | that non-validating processors are <a href="#include-if-valid"><cite>not | 900 | <a href="#NT-S">S</a> |
901 | obligated to</cite></a> read and process entity declarations occurring in parameter entities or in | 901 | <a href="#NT-DefaultDecl">DefaultDecl</a> |
902 | the external subset</span>; for such documents, | 902 | </code></td></tr></tbody></table><p>The <a href="#NT-Name">Name</a> in the <a href="#NT-AttlistDecl">AttlistDecl</a> |
903 | the rule that an entity must be declared is a well-formedness constraint only | 903 | rule is the type of an element. At user option, an XML processor <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> issue |
904 | if <a href="#sec-rmd"><cite>standalone='yes'</cite></a>.</p></div><div class="constraint"><p class="prefix"><a name="vc-entdeclared" id="vc-entdeclared" /><b>Validity constraint: Entity Declared</b></p><p>In a document with | 904 | a warning if attributes are declared for an element type not itself declared, |
905 | an external subset or external parameter entities with "<code>standalone='no'</code>", | 905 | but this is not an error. The <a href="#NT-Name">Name</a> in the <a href="#NT-AttDef">AttDef</a> |
906 | the <a href="#NT-Name">Name</a> given in the entity reference <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> <a title="match" href="#dt-match">match</a> that in an <a href="#sec-entity-decl"><cite>entity | 906 | rule is the name of the attribute.</p><p>When more than one <a href="#NT-AttlistDecl">AttlistDecl</a> is provided |
907 | declaration</cite></a>. For interoperability, valid documents <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> declare | 907 | for a given element type, the contents of all those provided are merged. When |
908 | the entities <code>amp</code>, | 908 | more than one definition is provided for the same attribute of a given element |
909 | <code>lt</code>, | 909 | type, the first declaration is binding and later declarations are ignored. <a title="For interoperability" href="#dt-interop">For interoperability,</a> writers of DTDs <span>may</span> choose |
910 | <code>gt</code>, | 910 | to provide at most one attribute-list declaration for a given element type, |
911 | <code>apos</code>, | 911 | at most one attribute definition for a given attribute name in an attribute-list |
912 | <code>quot</code>, in the form specified in <a href="#sec-predefined-ent"><b>4.6 Predefined Entities</b></a>. | 912 | declaration, and at least one attribute definition in each attribute-list |
913 | The declaration of a parameter entity <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> precede any reference to it. Similarly, | 913 | declaration. For interoperability, an XML processor <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> at user option |
914 | the declaration of a general entity <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> precede any attribute-list | 914 | issue a warning when more than one attribute-list declaration is provided |
915 | declaration containing a default value with a direct or indirect reference | 915 | for a given element type, or more than one attribute definition is provided |
916 | to that general entity.</p></div><div class="constraint"><p class="prefix"><a name="textent" id="textent" /><b>Well-formedness constraint: Parsed Entity</b></p><p>An entity reference <em class="rfc2119" title="Keyword in RFC 2119 context">MUST | 916 | for a given attribute, but this is not an error.</p><div class="div3"> <h4><a name="sec-attribute-types" id="sec-attribute-types"/>3.3.1 Attribute Types</h4><p>XML attribute types are of three kinds: a string type, a set of tokenized |
917 | NOT</em> contain the name of an <a title="Unparsed Entity" href="#dt-unparsed">unparsed entity</a>. | 917 | types, and enumerated types. The string type may take any literal string as |
918 | Unparsed entities may be referred to only in <a title="Attribute Value" href="#dt-attrval">attribute | 918 | a value; the tokenized types |
919 | values</a> declared to be of type <b>ENTITY</b> or <b>ENTITIES</b>.</p></div><div class="constraint"><p class="prefix"><a name="norecursion" id="norecursion" /><b>Well-formedness constraint: No Recursion</b></p><p>A parsed entity <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> contain a recursive reference to itself, either directly or indirectly.</p></div><div class="constraint"><p class="prefix"><a name="indtd" id="indtd" /><b>Well-formedness constraint: In DTD</b></p><p>Parameter-entity references <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> appear outside</span> | 919 | <span> |
920 | the <a title="Document Type Declaration" href="#dt-doctype">DTD</a>.</p></div><p>Examples of character and entity references:</p><div class="exampleInner"><pre>Type <key>less-than</key> (&#x3C;) to save options. | 920 | are more constrained</span>. |
921 | This document was prepared on &docdate; and | 921 | The validity constraints noted in the grammar are applied after the attribute |
922 | is classified &security-level;.</pre></div><p>Example of a parameter-entity reference:</p><div class="exampleInner"><pre><!-- declare the parameter entity "ISOLat2"... --> | 922 | value has been normalized as described in <a href="#AVNormalize"><b>3.3.3 Attribute-Value Normalization</b></a>.</p> <h5><a name="IDAVEJS" id="IDAVEJS"/>Attribute Types</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-AttType" id="NT-AttType"/>[54] </td><td><code>AttType</code></td><td> ::= </td><td><code> |
923 | <!ENTITY % ISOLat2 | 923 | <a href="#NT-StringType">StringType</a> | <a href="#NT-TokenizedType">TokenizedType</a> |
924 | SYSTEM "http://www.xml.com/iso/isolat2-xml.entities" > | 924 | | <a href="#NT-EnumeratedType">EnumeratedType</a> |
925 | <!-- ... now reference it. --> | 925 | </code></td></tr><tr valign="baseline"><td><a name="NT-StringType" id="NT-StringType"/>[55] </td><td><code>StringType</code></td><td> ::= </td><td><code>'CDATA'</code></td></tr><tr valign="baseline"><td><a name="NT-TokenizedType" id="NT-TokenizedType"/>[56] </td><td><code>TokenizedType</code></td><td> ::= </td><td><code>'ID'</code></td><td><a href="#id">[VC: ID]</a></td></tr><tr valign="baseline"><td/><td/><td/><td/><td><a href="#one-id-per-el">[VC: One ID per Element Type]</a></td></tr><tr valign="baseline"><td/><td/><td/><td/><td><a href="#id-default">[VC: ID Attribute Default]</a></td></tr><tr valign="baseline"><td/><td/><td/><td><code>| 'IDREF'</code></td><td><a href="#idref">[VC: IDREF]</a></td></tr><tr valign="baseline"><td/><td/><td/><td><code>| 'IDREFS'</code></td><td><a href="#idref">[VC: IDREF]</a></td></tr><tr valign="baseline"><td/><td/><td/><td><code>| 'ENTITY'</code></td><td><a href="#entname">[VC: Entity Name]</a></td></tr><tr valign="baseline"><td/><td/><td/><td><code>| 'ENTITIES'</code></td><td><a href="#entname">[VC: Entity Name]</a></td></tr><tr valign="baseline"><td/><td/><td/><td><code>| 'NMTOKEN'</code></td><td><a href="#nmtok">[VC: Name Token]</a></td></tr><tr valign="baseline"><td/><td/><td/><td><code>| 'NMTOKENS'</code></td><td><a href="#nmtok">[VC: Name Token]</a></td></tr></tbody></table><div class="constraint"><p class="prefix"><a name="id" id="id"/><b>Validity constraint: ID</b></p><p>Values of type <b>ID</b> |
926 | %ISOLat2;</pre></div></div><div class="div2"> <h3><a name="sec-entity-decl" id="sec-entity-decl" />4.2 Entity Declarations</h3><p>[<a name="dt-entdecl" id="dt-entdecl" title="entity declaration">Definition</a>: Entities are declared | 926 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match the <a href="#NT-Name">Name</a> production. A name <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> appear more than once |
927 | thus:]</p> <h5><a name="IDAECEU" id="IDAECEU" />Entity Declaration</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-EntityDecl" id="NT-EntityDecl" />[70] </td><td><code>EntityDecl</code></td><td> ::= </td><td><code><a href="#NT-GEDecl">GEDecl</a> | <a href="#NT-PEDecl">PEDecl</a></code></td></tr><tr valign="baseline"><td><a name="NT-GEDecl" id="NT-GEDecl" />[71] </td><td><code>GEDecl</code></td><td> ::= </td><td><code>'<!ENTITY' <a href="#NT-S">S</a> <a href="#NT-Name">Name</a> <a href="#NT-S">S</a> <a href="#NT-EntityDef">EntityDef</a> <a href="#NT-S">S</a>? | 927 | in an XML document as a value of this type; i.e., ID values <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> uniquely |
928 | '>'</code></td></tr><tr valign="baseline"><td><a name="NT-PEDecl" id="NT-PEDecl" />[72] </td><td><code>PEDecl</code></td><td> ::= </td><td><code>'<!ENTITY' <a href="#NT-S">S</a> '%' <a href="#NT-S">S</a> <a href="#NT-Name">Name</a> <a href="#NT-S">S</a> <a href="#NT-PEDef">PEDef</a> <a href="#NT-S">S</a>? '>'</code></td></tr><tr valign="baseline"><td><a name="NT-EntityDef" id="NT-EntityDef" />[73] </td><td><code>EntityDef</code></td><td> ::= </td><td><code><a href="#NT-EntityValue">EntityValue</a>| (<a href="#NT-ExternalID">ExternalID</a> <a href="#NT-NDataDecl">NDataDecl</a>?)</code></td></tr><tr valign="baseline"><td><a name="NT-PEDef" id="NT-PEDef" />[74] </td><td><code>PEDef</code></td><td> ::= </td><td><code><a href="#NT-EntityValue">EntityValue</a> | <a href="#NT-ExternalID">ExternalID</a></code></td></tr></tbody></table><p>The <a href="#NT-Name">Name</a> identifies the entity in an <a title="Entity Reference" href="#dt-entref">entity | 928 | identify the elements which bear them.</p></div><div class="constraint"><p class="prefix"><a name="one-id-per-el" id="one-id-per-el"/><b>Validity constraint: One ID per Element Type</b></p><p>An element type <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> have more than one ID attribute specified.</p></div><div class="constraint"><p class="prefix"><a name="id-default" id="id-default"/><b>Validity constraint: ID Attribute Default</b></p><p>An ID attribute |
929 | reference</a> or, in the case of an unparsed entity, in the value of | 929 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> have a declared default of <b>#IMPLIED</b> or <b>#REQUIRED</b>.</p></div><div class="constraint"><p class="prefix"><a name="idref" id="idref"/><b>Validity constraint: IDREF</b></p><p>Values of type <b>IDREF</b> |
930 | an <b>ENTITY</b> or <b>ENTITIES</b> attribute. If the same entity is declared | 930 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> |
931 | more than once, the first declaration encountered is binding; at user option, | 931 | match the <a href="#NT-Name">Name</a> production, and values of type <b>IDREFS</b> |
932 | an XML processor <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> issue a warning if entities are declared multiple times.</p><div class="div3"> <h4><a name="sec-internal-ent" id="sec-internal-ent" />4.2.1 Internal Entities</h4><p>[<a name="dt-internent" id="dt-internent" title="Internal Entity Replacement Text">Definition</a>: If the | 932 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match <a href="#NT-Names">Names</a>; each <a href="#NT-Name">Name</a> |
933 | entity definition is an <a href="#NT-EntityValue">EntityValue</a>, the defined | 933 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match the value of an ID attribute on some element in the XML document; |
934 | entity is called an <b>internal entity</b>. There is no separate physical | 934 | i.e. <b>IDREF</b> values <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match the value of some ID attribute.</p></div><div class="constraint"><p class="prefix"><a name="entname" id="entname"/><b>Validity constraint: Entity Name</b></p><p>Values of type <b>ENTITY</b> |
935 | storage object, and the content of the entity is given in the declaration.] | 935 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match the <a href="#NT-Name">Name</a> production, values of type <b>ENTITIES</b> |
936 | Note that some processing of entity and character references in the <a title="Literal Entity Value" href="#dt-litentval">literal entity value</a> may be required to produce | 936 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match <a href="#NT-Names">Names</a>; each <a href="#NT-Name">Name</a> |
937 | the correct <a title="Replacement Text" href="#dt-repltext">replacement text</a>: see <a href="#intern-replacement"><b>4.5 Construction of Entity Replacement Text</b></a>.</p><p>An internal entity is a <a title="Text Entity" href="#dt-parsedent">parsed entity</a>.</p><p>Example of an internal entity declaration:</p><div class="exampleInner"><pre><!ENTITY Pub-Status "This is a pre-release of the | 937 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match the name of an <a title="Unparsed Entity" href="#dt-unparsed">unparsed entity</a> |
938 | specification."></pre></div></div><div class="div3"> <h4><a name="sec-external-ent" id="sec-external-ent" />4.2.2 External Entities</h4><p>[<a name="dt-extent" id="dt-extent" title="External Entity">Definition</a>: If the entity is not internal, | 938 | declared in the <a title="Document Type Declaration" href="#dt-doctype">DTD</a>.</p></div><div class="constraint"><p class="prefix"><a name="nmtok" id="nmtok"/><b>Validity constraint: Name Token</b></p><p>Values of type <b>NMTOKEN</b> |
939 | it is an <b>external entity</b>, declared as follows:]</p> <h5><a name="IDAUIEU" id="IDAUIEU" />External Entity Declaration</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-ExternalID" id="NT-ExternalID" />[75] </td><td><code>ExternalID</code></td><td> ::= </td><td><code>'SYSTEM' <a href="#NT-S">S</a> <a href="#NT-SystemLiteral">SystemLiteral</a></code></td></tr><tr valign="baseline"><td /><td /><td /><td><code>| 'PUBLIC' <a href="#NT-S">S</a> <a href="#NT-PubidLiteral">PubidLiteral</a> <a href="#NT-S">S</a> <a href="#NT-SystemLiteral">SystemLiteral</a></code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-NDataDecl" id="NT-NDataDecl" />[76] </td><td><code>NDataDecl</code></td><td> ::= </td><td><code><a href="#NT-S">S</a> 'NDATA' <a href="#NT-S">S</a> <a href="#NT-Name">Name</a></code></td><td><a href="#not-declared">[VC: Notation Declared]</a></td></tr></tbody></table><p>If the <a href="#NT-NDataDecl">NDataDecl</a> is present, this is a general <a title="Unparsed Entity" href="#dt-unparsed">unparsed entity</a>; otherwise it is a parsed entity.</p><div class="constraint"><p class="prefix"><a name="not-declared" id="not-declared" /><b>Validity constraint: Notation Declared</b></p><p>The <a href="#NT-Name">Name</a> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match the declared name of a <a title="Notation" href="#dt-notation">notation</a>.</p></div><p>[<a name="dt-sysid" id="dt-sysid" title="System Identifier">Definition</a>: The <a href="#NT-SystemLiteral">SystemLiteral</a> is called the entity's <b>system | 939 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match the <a href="#NT-Nmtoken">Nmtoken</a> production; values of type <b>NMTOKENS</b> |
940 | identifier</b>. It is <span>meant to be | 940 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match <a href="#NT-Nmtokens">Nmtokens</a>.</p></div><p> |
941 | converted to</span> a URI reference | 941 | [<a name="dt-enumerated" id="dt-enumerated" title="Enumerated Attribute
Values">Definition</a>: |
942 | (as defined in <a href="#rfc2396">[IETF RFC 2396]</a>, updated by <a href="#rfc2732">[IETF RFC 2732]</a>), | 942 | <b>Enumerated attributes</b> |
943 | <span>as part of the | 943 | |
944 | process of dereferencing it</span> to obtain input for the XML processor to construct the | 944 | <span>have a list of allowed |
945 | entity's replacement text.] It is an error for a fragment identifier | 945 | values in their declaration</span> |
946 | (beginning with a <code>#</code> character) to be part of a system identifier. | 946 | ]. <span>They <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> take one of those values. </span>There are two kinds of enumerated<span> attribute</span> types:</p> <h5><a name="IDAANJS" id="IDAANJS"/>Enumerated Attribute Types</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-EnumeratedType" id="NT-EnumeratedType"/>[57] </td><td><code>EnumeratedType</code></td><td> ::= </td><td><code> |
947 | Unless otherwise provided by information outside the scope of this specification | 947 | <a href="#NT-NotationType">NotationType</a> |
948 | (e.g. a special XML element type defined by a particular DTD, or a processing | 948 | | <a href="#NT-Enumeration">Enumeration</a> |
949 | instruction defined by a particular application specification), relative URIs | 949 | </code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-NotationType" id="NT-NotationType"/>[58] </td><td><code>NotationType</code></td><td> ::= </td><td><code>'NOTATION' <a href="#NT-S">S</a> '(' <a href="#NT-S">S</a>? <a href="#NT-Name">Name</a> (<a href="#NT-S">S</a>? '|' <a href="#NT-S">S</a>? <a href="#NT-Name">Name</a>)* <a href="#NT-S">S</a>? ')' </code></td><td><a href="#notatn">[VC: Notation Attributes]</a></td></tr><tr valign="baseline"><td/><td/><td/><td/><td><a href="#OneNotationPer">[VC: One Notation Per Element Type]</a></td></tr><tr valign="baseline"><td/><td/><td/><td/><td><a href="#NoNotationEmpty">[VC: No Notation on Empty Element]</a></td></tr><tr valign="baseline"><td/><td/><td/><td/><td><a href="#NoDuplicateTokens">[VC: No Duplicate |
950 | are relative to the location of the resource within which the entity declaration | 950 | Tokens]</a></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-Enumeration" id="NT-Enumeration"/>[59] </td><td><code>Enumeration</code></td><td> ::= </td><td><code>'(' <a href="#NT-S">S</a>? <a href="#NT-Nmtoken">Nmtoken</a> |
951 | occurs. <span>This is defined to | 951 | (<a href="#NT-S">S</a>? '|' <a href="#NT-S">S</a>? <a href="#NT-Nmtoken">Nmtoken</a>)* <a href="#NT-S">S</a>? ')'</code></td><td><a href="#enum">[VC: Enumeration]</a></td></tr><tr valign="baseline"><td/><td/><td/><td/><td><a href="#NoDuplicateTokens">[VC: No Duplicate |
952 | be the external entity containing the '<' which starts the declaration, at the | 952 | Tokens]</a></td></tr></tbody></table><p>A <b>NOTATION</b> attribute identifies a <a title="Notation" href="#dt-notation">notation</a>, |
953 | point when it is parsed as a declaration.</span> | 953 | declared in the DTD with associated system and/or public identifiers, to be |
954 | A URI might thus be relative to the <a title="Document Entity" href="#dt-docent">document | 954 | used in interpreting the element to which the attribute is attached.</p><div class="constraint"><p class="prefix"><a name="notatn" id="notatn"/><b>Validity constraint: Notation Attributes</b></p><p>Values of this type |
955 | entity</a>, to the entity containing the <a title="Document Type Declaration" href="#dt-doctype">external | 955 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match one of the <a href="#Notations"><cite>notation</cite></a> names |
956 | DTD subset</a>, or to some other <a title="External Entity" href="#dt-extent">external parameter | 956 | included in the declaration; all notation names in the declaration <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be |
957 | entity</a>. <span>Attempts to | 957 | declared.</p></div><div class="constraint"><p class="prefix"><a name="OneNotationPer" id="OneNotationPer"/><b>Validity constraint: One Notation Per Element Type</b></p><p>An element type <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> have more than one <b>NOTATION</b> |
958 | retrieve the resource identified by a URI <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> be redirected at the parser | 958 | attribute specified.</p></div><div class="constraint"><p class="prefix"><a name="NoNotationEmpty" id="NoNotationEmpty"/><b>Validity constraint: No Notation on Empty Element</b></p><p> |
959 | level (for example, in an entity resolver) or below (at the protocol level, | 959 | <a title="For Compatibility" href="#dt-compat">For compatibility</a>, |
960 | for example, via an HTTP <code>Location:</code> header). In the absence of additional | 960 | an attribute of type <b>NOTATION</b> |
961 | information outside the scope of this specification within the resource, | 961 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> be declared on an element |
962 | the base URI of a resource is always the URI of the actual resource returned. | 962 | declared <b>EMPTY</b>.</p></div><div class="constraint"><p class="prefix"><a name="NoDuplicateTokens" id="NoDuplicateTokens"/><b>Validity constraint: No Duplicate |
963 | In other words, it is the URI of the resource retrieved after all redirection | 963 | Tokens</b></p><p>The notation names in a single <a href="#NT-NotationType">NotationType</a> |
964 | has occurred.</span></p><p>System | 964 | attribute declaration, as well as the <a href="#NT-Nmtoken">NmToken</a>s in a single |
965 | identifiers (and other XML strings meant to be used as URI references) <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> contain | 965 | <a href="#NT-Enumeration">Enumeration</a> attribute declaration, <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> all be distinct.</p></div><div class="constraint"><p class="prefix"><a name="enum" id="enum"/><b>Validity constraint: Enumeration</b></p><p>Values of this type <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match |
966 | characters that, according to <a href="#rfc2396">[IETF RFC 2396]</a> and <a href="#rfc2732">[IETF RFC 2732]</a>, | 966 | one of the <a href="#NT-Nmtoken">Nmtoken</a> tokens in the declaration.</p></div><p> |
967 | must be escaped before a URI can be used to retrieve the referenced resource. The | 967 | <a title="For interoperability" href="#dt-interop">For interoperability,</a> the same <a href="#NT-Nmtoken">Nmtoken</a> |
968 | characters to be escaped are the control characters #x0 to #x1F and #x7F (most of | 968 | <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD NOT</em> occur more than once in the enumerated |
969 | which cannot appear in XML), space #x20, the delimiters '<' #x3C, '>' #x3E and | 969 | attribute types of a single element type.</p></div><div class="div3"> <h4><a name="sec-attr-defaults" id="sec-attr-defaults"/>3.3.2 Attribute Defaults</h4><p>An <a title="Attribute-List Declaration" href="#dt-attdecl">attribute declaration</a> provides information |
970 | '"' #x22, the <em>unwise</em> characters '{' #x7B, '}' #x7D, '|' #x7C, '\' #x5C, '^' #x5E and | 970 | on whether the attribute's presence is <em class="rfc2119" title="Keyword in RFC 2119 context">REQUIRED</em>, and if not, how an XML processor |
971 | '`' #x60, as well as all characters above #x7F. Since escaping is not always a fully | 971 | is to react if a declared attribute is absent in a document.</p> <h5><a name="IDAGUJS" id="IDAGUJS"/>Attribute Defaults</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-DefaultDecl" id="NT-DefaultDecl"/>[60] </td><td><code>DefaultDecl</code></td><td> ::= </td><td><code>'#REQUIRED' | '#IMPLIED' </code></td></tr><tr valign="baseline"><td/><td/><td/><td><code>| (('#FIXED' <a href="#NT-S">S</a>)? <a href="#NT-AttValue">AttValue</a>)</code></td><td><a href="#RequiredAttr">[VC: Required Attribute]</a></td></tr><tr valign="baseline"><td/><td/><td/><td/><td><a href="#defattrvalid">[VC: Attribute Default Value Syntactically Correct]</a></td></tr><tr valign="baseline"><td/><td/><td/><td/><td><a href="#CleanAttrVals">[WFC: No < in Attribute Values]</a></td></tr><tr valign="baseline"><td/><td/><td/><td/><td><a href="#FixedAttr">[VC: Fixed Attribute Default]</a></td></tr><tr valign="baseline"><td/><td/><td/><td/><td><a href="#NoExternalRefs">[WFC: No External Entity References]</a></td></tr></tbody></table><p>In an attribute declaration, <b>#REQUIRED</b> means that the attribute |
972 | reversible process, it <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be performed only when absolutely necessary and as late | 972 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> always be provided, <b>#IMPLIED</b> that no default value is provided. |
973 | as possible in a processing chain. In particular, neither the process of converting | 973 | |
974 | a relative URI to an absolute one nor the process of passing a URI reference to a | 974 | [<a name="dt-default" id="dt-default" title="Attribute Default">Definition</a>: If |
975 | process or software component responsible for dereferencing it <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> trigger escaping. | 975 | the declaration is neither <b>#REQUIRED</b> nor <b>#IMPLIED</b>, then |
976 | When escaping does occur, it <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be performed as follows:</p><ol type="1"><li><p>Each | 976 | the <a href="#NT-AttValue">AttValue</a> value contains the declared <b>default</b> |
977 | character <span>to be escaped</span> | 977 | value; the <b>#FIXED</b> keyword states that the attribute <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> always have |
978 | is <span>represented in</span> | 978 | the default value. |
979 | UTF-8 <span><a href="#Unicode">[Unicode]</a></span> | 979 | When an XML processor encounters |
980 | as one or more bytes.</p></li><li><p><span>The resulting bytes</span> | 980 | an element |
981 | are escaped with | 981 | without a specification for an attribute for which it has read a default |
982 | the URI escaping mechanism (that is, converted to <code>%</code><var>HH</var>, | 982 | value declaration, it <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> report the attribute with the declared default |
983 | where HH is the hexadecimal notation of the byte value).</p></li><li><p>The original character is replaced by the resulting character sequence.</p></li></ol><p>[<a name="dt-pubid" id="dt-pubid" title="Public identifier">Definition</a>: In addition to a system | 983 | value to the application.] |
984 | identifier, an external identifier <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> include a <b>public identifier</b>.] | 984 | </p><div class="constraint"><p class="prefix"><a name="RequiredAttr" id="RequiredAttr"/><b>Validity constraint: Required Attribute</b></p><p>If the default |
985 | An XML processor attempting to retrieve the entity's content <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> use | 985 | declaration is the keyword <b>#REQUIRED</b>, then the attribute <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be |
986 | <span>any combination of | 986 | specified for all elements of the type in the attribute-list declaration.</p></div><div class="constraint"><p class="prefix"><a name="defattrvalid" id="defattrvalid"/><b>Validity constraint: Attribute Default Value Syntactically Correct</b></p><p>The declared default value <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> meet the syntactic |
987 | the public and system identifiers as well as additional information outside the | 987 | constraints of the declared attribute type.<span> |
988 | scope of this specification</span> to try to generate an alternative URI reference. | 988 | That is, the default value of an attribute:</span> |
989 | If the processor is unable to do so, it <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> use the URI | 989 | </p><ul><li><p> |
990 | reference specified in the system literal. Before a match is attempted, | 990 | of type IDREF or ENTITY must match the <a href="#NT-Name">Name</a> production;</p></li><li><p> |
991 | all strings of white space in the public identifier <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be normalized to | 991 | of type IDREFS or ENTITIES must match the <a href="#NT-Names">Names</a> production;</p></li><li><p> |
992 | single space characters (#x20), and leading and trailing white space <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> | 992 | of type NMTOKEN must match the <a href="#NT-Nmtoken">Nmtoken</a> production;</p></li><li><p> |
993 | be removed.</p><p>Examples of external entity declarations:</p><div class="exampleInner"><pre><!ENTITY open-hatch | 993 | of type NMTOKENS must match the <a href="#NT-Nmtokens">Nmtokens</a> production;</p></li><li><p> |
994 | SYSTEM "http://www.textuality.com/boilerplate/OpenHatch.xml"> | 994 | of an <a href="#NT-EnumeratedType">enumerated type</a> (either a <a href="#NT-NotationType">NOTATION</a> type or an <a href="#NT-Enumeration">enumeration</a>) must match one of the enumerated values.</p></li></ul><p>Note that only the |
995 | <!ENTITY open-hatch | 995 | syntactic constraints of the type are required here; other constraints (e.g. |
996 | PUBLIC "-//Textuality//TEXT Standard open-hatch boilerplate//EN" | 996 | that the value be the name of a declared unparsed entity, for an attribute of |
997 | "http://www.textuality.com/boilerplate/OpenHatch.xml"> | 997 | type ENTITY) |
998 | <!ENTITY hatch-pic | 998 | <span> |
999 | SYSTEM "../grafix/OpenHatch.gif" | 999 | will be reported by a validating |
1000 | NDATA gif ></pre></div></div></div><div class="div2"> <h3><a name="TextEntities" id="TextEntities" />4.3 Parsed Entities</h3><div class="div3"> <h4><a name="sec-TextDecl" id="sec-TextDecl" />4.3.1 The Text Declaration</h4><p>External parsed entities <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> each begin with a <b>text declaration</b>.</p> <h5><a name="IDAUPEU" id="IDAUPEU" />Text Declaration</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-TextDecl" id="NT-TextDecl" />[77] </td><td><code>TextDecl</code></td><td> ::= </td><td><code>'<?xml' <a href="#NT-VersionInfo">VersionInfo</a>? <a href="#NT-EncodingDecl">EncodingDecl</a> <a href="#NT-S">S</a>? '?>'</code></td></tr></tbody></table><p>The text declaration <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be provided literally, not by reference | 1000 | parser only if an element without a specification for this attribute |
1001 | to a parsed entity. <span class="mustard">The</span> text declaration | 1001 | actually occurs</span>.</p></div><div class="constraint"><p class="prefix"><a name="FixedAttr" id="FixedAttr"/><b>Validity constraint: Fixed Attribute Default</b></p><p>If an attribute |
1002 | <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em></span> appear at any | 1002 | has a default value declared with the <b>#FIXED</b> keyword, instances of |
1003 | position other than the beginning of an external parsed entity. The text declaration | 1003 | that attribute <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match the default value.</p></div><p>Examples of attribute-list declarations:</p><div class="exampleInner"><pre><!ATTLIST termdef |
1004 | in an external parsed entity is not considered part of its <a title="Replacement Text" href="#dt-repltext">replacement text</a>.</p></div><div class="div3"> <h4><a name="wf-entities" id="wf-entities" />4.3.2 Well-Formed Parsed Entities</h4><p>The document entity is well-formed if it matches the production labeled <a href="#NT-document">document</a>. An external general parsed entity is well-formed | 1004 | id ID #REQUIRED |
1005 | if it matches the production labeled <a href="#NT-extParsedEnt">extParsedEnt</a>. All | 1005 | name CDATA #IMPLIED> |
1006 | external parameter entities are well-formed by definition.</p> <h5><a name="IDA2REU" id="IDA2REU" />Well-Formed External Parsed Entity</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-extParsedEnt" id="NT-extParsedEnt" />[78] </td><td><code>extParsedEnt</code></td><td> ::= </td><td><code><a href="#NT-TextDecl">TextDecl</a>? <a href="#NT-content">content</a> - <a href="#NT-Char">Char</a>* <a href="#NT-RestrictedChar">RestrictedChar</a> <a href="#NT-Char">Char</a>*</code></td></tr></tbody></table><p>An internal general parsed entity is well-formed if its replacement text | 1006 | <!ATTLIST list |
1007 | matches the production labeled <a href="#NT-content">content</a>. All internal | 1007 | type (bullets|ordered|glossary) "ordered"> |
1008 | parameter entities are well-formed by definition.</p><p>A consequence of well-formedness in <span>general</span> | 1008 | <!ATTLIST form |
1009 | entities is that the logical and physical | 1009 | method CDATA #FIXED "POST"></pre></div></div><div class="div3"> <h4><a name="AVNormalize" id="AVNormalize"/>3.3.3 Attribute-Value Normalization</h4><p>Before the value of an attribute is passed to the application or checked |
1010 | structures in an XML document are properly nested; no <a title="Start-Tag" href="#dt-stag">start-tag</a>, <a title="End Tag" href="#dt-etag">end-tag</a>, <a title="Empty" href="#dt-empty">empty-element tag</a>, <a title="Element" href="#dt-element">element</a>, <a title="Comment" href="#dt-comment">comment</a>, <a title="Processing instruction" href="#dt-pi">processing instruction</a>, <a title="Character Reference" href="#dt-charref">character | 1010 | for validity, the XML processor <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> normalize the attribute value by applying |
1011 | reference</a>, or <a title="Entity Reference" href="#dt-entref">entity reference</a> | 1011 | the algorithm below, or by using some other method such that the value passed |
1012 | can begin in one entity and end in another.</p></div><div class="div3"> <h4><a name="charencoding" id="charencoding" />4.3.3 Character Encoding in Entities</h4><p>Each external parsed entity in an XML document <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> use a different encoding | 1012 | to the application is the same as that produced by the algorithm.</p><ol class="enumar"><li><p>All line breaks <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> have been normalized on input to #xA as described |
1013 | for its characters. All XML processors <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be able to read entities in both | 1013 | in <a href="#sec-line-ends"><b>2.11 End-of-Line Handling</b></a>, so the rest of this algorithm operates |
1014 | the UTF-8 and UTF-16 encodings. The terms "UTF-8" | 1014 | on text normalized in this way.</p></li><li><p>Begin with a normalized value consisting of the empty string.</p></li><li><p>For each character, entity reference, or character reference in the |
1015 | and "UTF-16" in this specification do not apply to character | 1015 | unnormalized attribute value, beginning with the first and continuing to the |
1016 | encodings with any other labels, even if the encodings or labels are very | 1016 | last, do the following:</p><ul><li><p>For a character reference, append the referenced character to the |
1017 | similar to UTF-8 or UTF-16.</p><p>Entities encoded in UTF-16 <span><em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em></span> <span>and entities | 1017 | normalized value.</p></li><li><p>For an entity reference, recursively apply step 3 of this algorithm |
1018 | encoded in UTF-8 <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em></span> begin with the Byte Order Mark described in | 1018 | to the replacement text of the entity.</p></li><li><p>For a white space character (#x20, #xD, #xA, #x9), append a space |
1019 | ISO/IEC 10646 <a href="#ISO10646">[ISO/IEC 10646]</a> or Unicode <a href="#Unicode">[Unicode]</a> | 1019 | character (#x20) to the normalized value.</p></li><li><p>For another character, append the character to the normalized value.</p></li></ul></li></ol><p>If the attribute type is not CDATA, then the XML processor <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> further |
1020 | (the ZERO WIDTH NO-BREAK SPACE character, #xFEFF). This is an encoding signature, | 1020 | process the normalized attribute value by discarding any leading and trailing |
1021 | not part of either the markup or the character data of the XML document. XML | 1021 | space (#x20) characters, and by replacing sequences of space (#x20) characters |
1022 | processors <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be able to use this character to differentiate between UTF-8 | 1022 | by a single space (#x20) character.</p><p>Note that if the unnormalized attribute value contains a character reference |
1023 | and UTF-16 encoded documents.</p><p>Although an XML processor is required to read only entities in the UTF-8 | 1023 | to a white space character other than space (#x20), the normalized value contains |
1024 | and UTF-16 encodings, it is recognized that other encodings are used around | 1024 | the referenced character itself (#xD, #xA or #x9). This contrasts with the |
1025 | the world, and it may be desired for XML processors to read entities that | 1025 | case where the unnormalized value contains a white space character (not a |
1026 | use them. In | 1026 | reference), which is replaced with a space character (#x20) in the normalized |
1027 | the absence of external character encoding information (such as MIME headers), | 1027 | value and also contrasts with the case where the unnormalized value contains |
1028 | parsed entities which are stored in an encoding other than UTF-8 or UTF-16 | 1028 | an entity reference whose replacement text contains a white space character; |
1029 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> begin with a text declaration (see <a href="#sec-TextDecl"><b>4.3.1 The Text Declaration</b></a>) containing | 1029 | being recursively processed, the white space character is replaced with a |
1030 | an encoding declaration:</p> <h5><a name="IDARVEU" id="IDARVEU" />Encoding Declaration</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-EncodingDecl" id="NT-EncodingDecl" />[80] </td><td><code>EncodingDecl</code></td><td> ::= </td><td><code><a href="#NT-S">S</a> 'encoding' <a href="#NT-Eq">Eq</a> | 1030 | space character (#x20) in the normalized value.</p><p>All attributes for which no declaration has been read <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> be treated |
1031 | ('"' <a href="#NT-EncName">EncName</a> '"' | "'" <a href="#NT-EncName">EncName</a> | 1031 | by a non-validating processor as if declared <b>CDATA</b>.</p><p>It |
1032 | "'" ) </code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-EncName" id="NT-EncName" />[81] </td><td><code>EncName</code></td><td> ::= </td><td><code>[A-Za-z] ([A-Za-z0-9._] | '-')*</code></td><td><i>/* Encoding | 1032 | is an error if an |
1033 | name contains only Latin characters */</i></td></tr></tbody></table><p>In the <a title="Document Entity" href="#dt-docent">document entity</a>, the encoding | 1033 | <a title="Attribute Value" href="#dt-attrval">attribute |
1034 | declaration is part of the <a title="XML Declaration" href="#dt-xmldecl">XML declaration</a>. | 1034 | value</a> contains a <a title="Entity Reference" href="#dt-entref">reference</a> to an |
1035 | The <a href="#NT-EncName">EncName</a> is the name of the encoding used.</p><p>In an encoding declaration, the values "<code>UTF-8</code>", "<code>UTF-16</code>", | 1035 | entity for which no declaration has been read.</p><p>Following are examples of attribute normalization. Given the following |
1036 | "<code>ISO-10646-UCS-2</code>", and "<code>ISO-10646-UCS-4</code>" <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> be used | 1036 | declarations:</p><div class="exampleInner"><pre><!ENTITY d "&#xD;"> |
1037 | for the various encodings and transformations of Unicode / ISO/IEC 10646, | 1037 | <!ENTITY a "&#xA;"> |
1038 | the values "<code>ISO-8859-1</code>", "<code>ISO-8859-2</code>", | 1038 | <!ENTITY da "&#xD;&#xA;"></pre></div><p>the attribute specifications in the left column below would be normalized |
1039 | ... "<code>ISO-8859-</code><var>n</var>" (where <var>n</var> | 1039 | to the character sequences of the middle column if the attribute <code>a</code> |
1040 | is the part number) <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> be used for the parts of ISO 8859, and | 1040 | is declared <b>NMTOKENS</b> and to those of the right columns if <code>a</code> |
1041 | the values "<code>ISO-2022-JP</code>", "<code>Shift_JIS</code>", | 1041 | is declared <b>CDATA</b>.</p><table border="1" frame="border" summary="Attribute normalization summary"><thead><tr><th>Attribute specification</th><th>a is NMTOKENS</th><th>a is CDATA</th></tr></thead><tbody><tr><td> |
1042 | and "<code>EUC-JP</code>" <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> be used for the various encoded | 1042 | <div class="exampleInner"><pre>a=" |
1043 | forms of JIS X-0208-1997. It | 1043 | xyz"</pre></div> |
1044 | is <em class="rfc2119" title="Keyword in RFC 2119 context">RECOMMENDED</em> that character encodings registered (as <em>charset</em>s) | 1044 | </td><td> |
1045 | with the Internet Assigned Numbers Authority <a href="#IANA">[IANA-CHARSETS]</a>, | 1045 | <div class="exampleInner"><pre>x y z</pre></div> |
1046 | other than those just listed, be referred to using their registered names; | 1046 | </td><td> |
1047 | other encodings <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> use names starting with an "x-" prefix. | 1047 | <div class="exampleInner"><pre>#x20 #x20 x y z</pre></div> |
1048 | XML processors <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> match character encoding names in a case-insensitive | 1048 | </td></tr><tr><td> |
1049 | way and <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> either interpret an IANA-registered name as the encoding registered | 1049 | <div class="exampleInner"><pre>a="&d;&d;A&a;&#x20;&a;B&da;"</pre></div> |
1050 | at IANA for that name or treat it as unknown (processors are, of course, not | 1050 | </td><td> |
1051 | required to support all IANA-registered encodings).</p><p>In the absence of information provided by an external transport protocol | 1051 | <div class="exampleInner"><pre>A #x20 B</pre></div> |
1052 | (e.g. HTTP or MIME), it is a <a title="Fatal Error" href="#dt-fatal">fatal error</a> for | 1052 | </td><td> |
1053 | an entity including an encoding declaration to be presented to the XML processor | 1053 | <div class="exampleInner"><pre>#x20 #x20 A #x20 #x20 #x20 B #x20 #x20</pre></div> |
1054 | in an encoding other than that named in the declaration, or for an entity which | 1054 | </td></tr><tr><td> |
1055 | begins with neither a Byte Order Mark | 1055 | <div class="exampleInner"><pre>a= |
1056 | nor an encoding declaration to use an encoding other than UTF-8. Note that | 1056 | "&#xd;&#xd;A&#xa;&#xa;B&#xd;&#xa;"</pre></div> |
1057 | since ASCII is a subset of UTF-8, ordinary ASCII entities do not strictly | 1057 | </td><td> |
1058 | need an encoding declaration.</p><p>It is a <a title="Fatal Error" href="#dt-fatal">fatal error</a> for a <a href="#NT-TextDecl">TextDecl</a> to occur other | 1058 | <div class="exampleInner"><pre>#xD #xD A #xA #xA B #xD #xA</pre></div> |
1059 | than at the beginning of an external entity.</p><p>It is a <a title="Fatal Error" href="#dt-fatal">fatal error</a> when an XML processor | 1059 | </td><td> |
1060 | encounters an entity with an encoding that it is unable to process. It | 1060 | <div class="exampleInner"><pre>#xD #xD A #xA #xA B #xD #xA</pre></div> |
1061 | is a <a title="Fatal Error" href="#dt-fatal">fatal error</a> if an XML entity is determined (via default, encoding declaration, | 1061 | </td></tr></tbody></table><p>Note that the last example is invalid (but well-formed) if <code>a</code> |
1062 | or higher-level protocol) to be in a certain encoding but contains <span>byte</span> | 1062 | is declared to be of type <b>NMTOKENS</b>.</p></div></div><div class="div2"> <h3><a name="sec-condition-sect" id="sec-condition-sect"/>3.4 Conditional Sections</h3><p> |
1063 | sequences that are not legal in that encoding. <span>Specifically, it is a | 1063 | [<a name="dt-cond-section" id="dt-cond-section" title="conditional section">Definition</a>: |
1064 | fatal error if an entity encoded in UTF-8 contains any irregular code unit sequences, | 1064 | <b>Conditional |
1065 | as defined in Unicode <a href="#Unicode">[Unicode]</a>.</span> <span>Unless an encoding | 1065 | sections</b> are portions of the <a title="Document Type Declaration" href="#dt-doctype">document type |
1066 | is determined by a higher-level protocol, </span>it is also a <a title="Fatal Error" href="#dt-fatal">fatal error</a> if an XML entity | 1066 | declaration external subset</a> or |
1067 | contains no encoding declaration and its content is not legal UTF-8 or UTF-16.</p><p>Examples of text declarations containing encoding declarations:</p><div class="exampleInner"><pre><?xml encoding='UTF-8'?> | 1067 | of external parameter entities which are included in, or excluded from, |
1068 | <?xml encoding='EUC-JP'?></pre></div></div><div class="div3"> <h4><a name="sec-version-info" id="sec-version-info" />4.3.4 Version Information in Entities</h4><p>Each entity, including the <a title="Document Entity" href="#dt-docent">document entity</a>, | 1068 | the logical structure of the DTD based on the keyword which governs them.] |
1069 | can be separately | 1069 | </p> <h5><a name="IDA1AKS" id="IDA1AKS"/>Conditional Section</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-conditionalSect" id="NT-conditionalSect"/>[61] </td><td><code>conditionalSect</code></td><td> ::= </td><td><code> |
1070 | declared as XML 1.0 or XML 1.1. The version declaration appearing | 1070 | <a href="#NT-includeSect">includeSect</a> | <a href="#NT-ignoreSect">ignoreSect</a> |
1071 | in the document entity determines the version of the document as a | 1071 | </code></td></tr><tr valign="baseline"><td><a name="NT-includeSect" id="NT-includeSect"/>[62] </td><td><code>includeSect</code></td><td> ::= </td><td><code>'<![' <a href="#NT-S">S</a>? 'INCLUDE' <a href="#NT-S">S</a>? '[' <a href="#NT-extSubsetDecl">extSubsetDecl</a> |
1072 | whole. An XML 1.1 document may invoke XML 1.0 external entities, so | 1072 | ']]>' </code></td><td><a href="#condsec-nesting">[VC: Proper Conditional Section/PE Nesting]</a></td></tr><tr valign="baseline"><td><a name="NT-ignoreSect" id="NT-ignoreSect"/>[63] </td><td><code>ignoreSect</code></td><td> ::= </td><td><code>'<![' <a href="#NT-S">S</a>? 'IGNORE' <a href="#NT-S">S</a>? '[' <a href="#NT-ignoreSectContents">ignoreSectContents</a>* |
1073 | that otherwise duplicated versions of external entities, | 1073 | ']]>'</code></td><td><a href="#condsec-nesting">[VC: Proper Conditional Section/PE Nesting]</a></td></tr><tr valign="baseline"><td><a name="NT-ignoreSectContents" id="NT-ignoreSectContents"/>[64] </td><td><code>ignoreSectContents</code></td><td> ::= </td><td><code> |
1074 | particularly DTD external subsets, need not be maintained. However, | 1074 | <a href="#NT-Ignore">Ignore</a> ('<![' <a href="#NT-ignoreSectContents">ignoreSectContents</a> ']]>' <a href="#NT-Ignore">Ignore</a>)*</code></td></tr><tr valign="baseline"><td><a name="NT-Ignore" id="NT-Ignore"/>[65] </td><td><code>Ignore</code></td><td> ::= </td><td><code> |
1075 | in such a case the rules of XML 1.1 are applied to the entire | 1075 | <a href="#NT-Char">Char</a>* - (<a href="#NT-Char">Char</a>* |
1076 | document.</p><p> If an entity (including the document entity) is not labeled | 1076 | ('<![' | ']]>') <a href="#NT-Char">Char</a>*) </code></td></tr></tbody></table><div class="constraint"><p class="prefix"><a name="condsec-nesting" id="condsec-nesting"/><b>Validity constraint: Proper Conditional Section/PE Nesting</b></p><p>If any of the "<code><![</code>", |
1077 | with a version number, it is treated as if labeled as version | 1077 | "<code>[</code>", or "<code>]]></code>" of a conditional section is contained |
1078 | 1.0.</p></div></div><div class="div2"> <h3><a name="entproc" id="entproc" />4.4 XML Processor Treatment of Entities and References</h3><p>The table below summarizes the contexts in which character references, | 1078 | in the replacement text for a parameter-entity reference, all of them <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> |
1079 | entity references, and invocations of unparsed entities might appear and the | 1079 | be contained in the same replacement text.</p></div><p>Like the internal and external DTD subsets, a conditional section may contain |
1080 | <em class="rfc2119" title="Keyword in RFC 2119 context">REQUIRED</em> behavior of an <a title="XML Processor" href="#dt-xml-proc">XML processor</a> | 1080 | one or more complete declarations, comments, processing instructions, or nested |
1081 | in each case. The labels in the leftmost column describe the recognition context: </p><dl><dt class="label">Reference in Content</dt><dd><p>as a reference anywhere after the <a title="Start-Tag" href="#dt-stag">start-tag</a> | 1081 | conditional sections, intermingled with white space.</p><p>If the keyword of the conditional section is <b>INCLUDE</b>, then the |
1082 | and before the <a title="End Tag" href="#dt-etag">end-tag</a> of an element; corresponds | 1082 | contents of the conditional section <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be |
1083 | to the nonterminal <a href="#NT-content">content</a>.</p></dd><dt class="label">Reference in Attribute Value</dt><dd><p>as a reference within either the value of an attribute in a <a title="Start-Tag" href="#dt-stag">start-tag</a>, | 1083 | <span>processed as</span> |
1084 | or a default value in an <a title="Attribute-List Declaration" href="#dt-attdecl">attribute declaration</a>; | 1084 | part of the DTD. If the keyword of |
1085 | corresponds to the nonterminal <a href="#NT-AttValue">AttValue</a>.</p></dd><dt class="label">Occurs as Attribute Value</dt><dd><p>as a <a href="#NT-Name">Name</a>, not a reference, appearing either as | 1085 | the conditional section is <b>IGNORE</b>, then the contents of the conditional |
1086 | the value of an attribute which has been declared as type <b>ENTITY</b>, | 1086 | section <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> |
1087 | or as one of the space-separated tokens in the value of an attribute which | 1087 | |
1088 | has been declared as type <b>ENTITIES</b>.</p></dd><dt class="label">Reference in Entity Value</dt><dd><p>as a reference within a parameter or internal entity's <a title="Literal Entity Value" href="#dt-litentval">literal | 1088 | <span> |
1089 | entity value</a> in the entity's declaration; corresponds to the nonterminal <a href="#NT-EntityValue">EntityValue</a>.</p></dd><dt class="label">Reference in DTD</dt><dd><p>as a reference within either the internal or external subsets of the <a title="Document Type Declaration" href="#dt-doctype">DTD</a>, but outside of an <a href="#NT-EntityValue">EntityValue</a>, <a href="#NT-AttValue">AttValue</a>, <a href="#NT-PI">PI</a>, <a href="#NT-Comment">Comment</a>, <a href="#NT-SystemLiteral">SystemLiteral</a>, <a href="#NT-PubidLiteral">PubidLiteral</a>, | 1089 | <em class="rfc2119" title="Keyword in RFC 2119 context">NOT</em> be processed as</span> part of the DTD. |
1090 | or the contents of an ignored conditional section (see <a href="#sec-condition-sect"><b>3.4 Conditional Sections</b></a>).</p><p>.</p></dd></dl><p></p><table border="1" frame="border" cellpadding="7" summary="Entity type/reference matrix"><tbody align="center"><tr><td rowspan="2" colspan="1"></td><td colspan="4" align="center" valign="bottom" rowspan="1">Entity | 1090 | If a conditional section with a keyword of <b>INCLUDE</b> occurs within |
1091 | Type</td><td rowspan="2" align="center" colspan="1">Character</td></tr><tr align="center" valign="bottom"><td rowspan="1" colspan="1">Parameter</td><td rowspan="1" colspan="1">Internal General</td><td rowspan="1" colspan="1">External Parsed | 1091 | a larger conditional section with a keyword of <b>IGNORE</b>, both the outer |
1092 | General</td><td rowspan="1" colspan="1">Unparsed</td></tr><tr align="center" valign="middle"><td align="right" rowspan="1" colspan="1">Reference | 1092 | and the inner conditional sections <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be ignored. The contents |
1093 | in Content</td><td rowspan="1" colspan="1"><a href="#not-recognized"><cite>Not recognized</cite></a></td><td rowspan="1" colspan="1"><a href="#included"><cite>Included</cite></a></td><td rowspan="1" colspan="1"><a href="#include-if-valid"><cite>Included | 1093 | of an ignored conditional section <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be parsed by ignoring all characters after |
1094 | if validating</cite></a></td><td rowspan="1" colspan="1"><a href="#forbidden"><cite>Forbidden</cite></a></td><td rowspan="1" colspan="1"><a href="#included"><cite>Included</cite></a></td></tr><tr align="center" valign="middle"><td align="right" rowspan="1" colspan="1">Reference in Attribute Value</td><td rowspan="1" colspan="1"><a href="#not-recognized"><cite>Not recognized</cite></a></td><td rowspan="1" colspan="1"><a href="#inliteral"><cite>Included | 1094 | the "<code>[</code>" following the keyword, except conditional section starts |
1095 | in literal</cite></a></td><td rowspan="1" colspan="1"><a href="#forbidden"><cite>Forbidden</cite></a></td><td rowspan="1" colspan="1"><a href="#forbidden"><cite>Forbidden</cite></a></td><td rowspan="1" colspan="1"><a href="#included"><cite>Included</cite></a></td></tr><tr align="center" valign="middle"><td align="right" rowspan="1" colspan="1">Occurs as Attribute | 1095 | "<code><![</code>" and ends "<code>]]></code>", until the matching conditional |
1096 | Value</td><td rowspan="1" colspan="1"><a href="#not-recognized"><cite>Not recognized</cite></a></td><td rowspan="1" colspan="1"><a href="#forbidden"><cite>Forbidden</cite></a></td><td rowspan="1" colspan="1"><a href="#forbidden"><cite>Forbidden</cite></a></td><td rowspan="1" colspan="1"><a href="#notify"><cite>Notify</cite></a></td><td rowspan="1" colspan="1"><a href="#not-recognized"><cite>Not recognized</cite></a></td></tr><tr align="center" valign="middle"><td align="right" rowspan="1" colspan="1">Reference in EntityValue</td><td rowspan="1" colspan="1"><a href="#inliteral"><cite>Included in literal</cite></a></td><td rowspan="1" colspan="1"><a href="#bypass"><cite>Bypassed</cite></a></td><td rowspan="1" colspan="1"><a href="#bypass"><cite>Bypassed</cite></a></td><td rowspan="1" colspan="1"><a href="#error"><cite><span>Error</span></cite></a></td><td rowspan="1" colspan="1"><a href="#included"><cite>Included</cite></a></td></tr><tr align="center" valign="middle"><td align="right" rowspan="1" colspan="1">Reference in DTD</td><td rowspan="1" colspan="1"><a href="#as-PE"><cite>Included as PE</cite></a></td><td rowspan="1" colspan="1"><a href="#forbidden"><cite>Forbidden</cite></a></td><td rowspan="1" colspan="1"><a href="#forbidden"><cite>Forbidden</cite></a></td><td rowspan="1" colspan="1"><a href="#forbidden"><cite>Forbidden</cite></a></td><td rowspan="1" colspan="1"><a href="#forbidden"><cite>Forbidden</cite></a></td></tr></tbody></table><div class="div3"> <h4><a name="not-recognized" id="not-recognized" />4.4.1 Not Recognized</h4><p>Outside the DTD, the <code>%</code> character has no special significance; | 1096 | section end is found. Parameter entity references <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> be recognized in this |
1097 | thus, what would be parameter entity references in the DTD are not recognized | 1097 | process.</p><p>If the keyword of the conditional section is a parameter-entity reference, |
1098 | as markup in <a href="#NT-content">content</a>. Similarly, the names of unparsed | 1098 | the parameter entity <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be replaced by its content before the processor |
1099 | entities are not recognized except when they appear in the value of an appropriately | 1099 | decides whether to include or ignore the conditional section.</p><p>An example:</p><div class="exampleInner"><pre><!ENTITY % draft 'INCLUDE' > |
1100 | declared attribute.</p></div><div class="div3"> <h4><a name="included" id="included" />4.4.2 Included</h4><p>[<a name="dt-include" id="dt-include" title="Include">Definition</a>: An entity is <b>included</b> | 1100 | <!ENTITY % final 'IGNORE' > |
1101 | when its <a title="Replacement Text" href="#dt-repltext">replacement text</a> is retrieved | 1101 | <![%draft;[ |
1102 | and processed, in place of the reference itself, as though it were part of | 1102 | <!ELEMENT book (comments*, title, body, supplements?)> |
1103 | the document at the location the reference was recognized.] The replacement | 1103 | ]]> |
1104 | text <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> contain both <a title="Character Data" href="#dt-chardata">character data</a> | 1104 | <![%final;[ |
1105 | and (except for parameter entities) <a title="Markup" href="#dt-markup">markup</a>, | 1105 | <!ELEMENT book (title, body, supplements?)> |
1106 | which <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be recognized in the usual way. (The string "<code>AT&amp;T;</code>" | 1106 | ]]></pre></div></div></div><div class="div1"> <h2><a name="sec-physical-struct" id="sec-physical-struct"/>4 Physical Structures</h2><p> |
1107 | expands to "<code>AT&T;</code>" and the remaining ampersand | 1107 | [<a name="dt-entity" id="dt-entity" title="Entity">Definition</a>: An XML document may consist of one |
1108 | is not recognized as an entity-reference delimiter.) A character reference | 1108 | or many storage units. These |
1109 | is <b>included</b> when the indicated character is processed in place | 1109 | are called <b>entities</b>; they all have <b>content</b> and are |
1110 | of the reference itself. </p></div><div class="div3"> <h4><a name="include-if-valid" id="include-if-valid" />4.4.3 Included If Validating</h4><p>When an XML processor recognizes a reference to a parsed entity, in order | 1110 | all (except for the <a title="Document Entity" href="#dt-docent">document entity</a> and |
1111 | to <a title="Validity" href="#dt-valid">validate</a> the document, the processor | 1111 | the <a title="Document Type Declaration" href="#dt-doctype">external DTD subset</a>) identified by |
1112 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> <a title="Include" href="#dt-include">include</a> its replacement text. If | 1112 | entity <b>name</b>.] Each XML document has one entity |
1113 | the entity is external, and the processor is not attempting to validate the | 1113 | called the <a title="Document Entity" href="#dt-docent">document entity</a>, which serves |
1114 | XML document, the processor <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em>, but need | 1114 | as the starting point for the <a title="XML Processor" href="#dt-xml-proc">XML processor</a> |
1115 | not, include the entity's replacement text. If a non-validating processor | 1115 | and may contain the whole document.</p><p>Entities may be either parsed or unparsed. [<a name="dt-parsedent" id="dt-parsedent" title="Text Entity">Definition</a>: The contents of a <b>parsed |
1116 | does not include the replacement text, it <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> inform the application that | 1116 | entity</b> are referred to as its <a title="Replacement Text" href="#dt-repltext">replacement |
1117 | it recognized, but did not read, the entity.</p><p>This rule is based on the recognition that the automatic inclusion provided | 1117 | text</a>; this <a title="Text" href="#dt-text">text</a> is considered an |
1118 | by the SGML and XML entity mechanism, primarily designed to support modularity | 1118 | integral part of the document.] |
1119 | in authoring, is not necessarily appropriate for other applications, in particular | 1119 | </p><p> |
1120 | document browsing. Browsers, for example, when encountering an external parsed | 1120 | [<a name="dt-unparsed" id="dt-unparsed" title="Unparsed Entity">Definition</a>: An <b>unparsed entity</b> |
1121 | entity reference, might choose to provide a visual indication of the entity's | 1121 | is a resource whose contents may or may not be <a title="Text" href="#dt-text">text</a>, |
1122 | presence and retrieve it for display only on demand.</p></div><div class="div3"> <h4><a name="forbidden" id="forbidden" />4.4.4 Forbidden</h4><p>The following are forbidden, and constitute <a title="Fatal Error" href="#dt-fatal">fatal | 1122 | and if text, may |
1123 | errors</a>:</p><ul><li><p>the appearance of a reference to an <a title="Unparsed Entity" href="#dt-unparsed">unparsed | 1123 | be other than XML. Each unparsed entity has an associated <a title="Notation" href="#dt-notation">notation</a>, identified by name. Beyond a requirement |
1124 | entity</a><span>, except in the | 1124 | that an XML processor make the identifiers for the entity and notation available |
1125 | <a href="#NT-EntityValue">EntityValue</a> in an entity declaration</span>.</p></li><li><p>the appearance of any character or general-entity reference in the | 1125 | to the application, XML places no constraints on the contents of unparsed |
1126 | DTD except within an <a href="#NT-EntityValue">EntityValue</a> or <a href="#NT-AttValue">AttValue</a>.</p></li><li><p>a reference to an external entity in an attribute value.</p></li></ul></div><div class="div3"> <h4><a name="inliteral" id="inliteral" />4.4.5 Included in Literal</h4><p>When an <a title="Entity Reference" href="#dt-entref">entity reference</a> appears in | 1126 | entities.] |
1127 | an attribute value, or a parameter entity reference appears in a literal entity | 1127 | </p><p>Parsed entities are invoked by name using entity references; unparsed entities |
1128 | value, its <a title="Replacement Text" href="#dt-repltext">replacement text</a> <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be</span> processed | 1128 | by name, given in the value of <b>ENTITY</b> or <b>ENTITIES</b> attributes.</p><p> |
1129 | in place of the reference itself as though it were part of the document at | 1129 | [<a name="gen-entity" id="gen-entity" title="general entity">Definition</a>: |
1130 | the location the reference was recognized, except that a single or double | 1130 | <b>General entities</b> |
1131 | quote character in the replacement text <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> always be</span> treated as a normal data | 1131 | are entities for use within the document content. In this specification, general |
1132 | character and <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em></span> terminate the literal. For example, this is well-formed:</p><div class="exampleInner"><pre><!ENTITY % YN '"Yes"' > | 1132 | entities are sometimes referred to with the unqualified term <em>entity</em> |
1133 | <!ENTITY WhatHeSaid "He said %YN;" ></pre></div><p>while this is not:</p><div class="exampleInner"><pre><!ENTITY EndAttr "27'" > | 1133 | when this leads to no ambiguity.] |
1134 | <element attribute='a-&EndAttr;></pre></div></div><div class="div3"> <h4><a name="notify" id="notify" />4.4.6 Notify</h4><p>When the name of an <a title="Unparsed Entity" href="#dt-unparsed">unparsed entity</a> | 1134 | [<a name="dt-PE" id="dt-PE" title="Parameter entity">Definition</a>: |
1135 | appears as a token in the value of an attribute of declared type <b>ENTITY</b> | 1135 | <b>Parameter |
1136 | or <b>ENTITIES</b>, a validating processor <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> inform the application of | 1136 | entities</b> are parsed entities for use within the DTD.] |
1137 | the <a title="System Identifier" href="#dt-sysid">system</a> and <a title="Public identifier" href="#dt-pubid">public</a> | 1137 | These two types of entities use different forms of reference and are recognized |
1138 | (if any) identifiers for both the entity and its associated <a title="Notation" href="#dt-notation">notation</a>.</p></div><div class="div3"> <h4><a name="bypass" id="bypass" />4.4.7 Bypassed</h4><p>When a general entity reference appears in the <a href="#NT-EntityValue">EntityValue</a> | 1138 | in different contexts. Furthermore, they occupy different namespaces; a parameter |
1139 | in an entity declaration, it <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be</span> bypassed and left as is.</p></div><div class="div3"> <h4><a name="as-PE" id="as-PE" />4.4.8 Included as PE</h4><p>Just as with external parsed entities, parameter entities need only be <a href="#include-if-valid"><cite>included if validating</cite></a>. When a parameter-entity | 1139 | entity and a general entity with the same name are two distinct entities.</p><div class="div2"> <h3><a name="sec-references" id="sec-references"/>4.1 Character and Entity References</h3><p> |
1140 | reference is recognized in the DTD and included, its <a title="Replacement Text" href="#dt-repltext">replacement | 1140 | [<a name="dt-charref" id="dt-charref" title="Character Reference">Definition</a>: A <b>character |
1141 | text</a> <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be</span> enlarged by the attachment of one leading and one following | 1141 | reference</b> refers to a specific character in the ISO/IEC 10646 character |
1142 | space (#x20) character; the intent is to constrain the replacement text of | 1142 | set, for example one not directly accessible from available input devices.] |
1143 | parameter entities to contain an integral number of grammatical tokens in | 1143 | </p> <h5><a name="IDAOKKS" id="IDAOKKS"/>Character Reference</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-CharRef" id="NT-CharRef"/>[66] </td><td><code>CharRef</code></td><td> ::= </td><td><code>'&#' [0-9]+ ';' </code></td></tr><tr valign="baseline"><td/><td/><td/><td><code>| '&#x' [0-9a-fA-F]+ ';'</code></td><td><a href="#wf-Legalchar">[WFC: Legal Character]</a></td></tr></tbody></table><div class="constraint"><p class="prefix"><a name="wf-Legalchar" id="wf-Legalchar"/><b>Well-formedness constraint: Legal Character</b></p><p>Characters referred |
1144 | the DTD. This | 1144 | to using character references <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match the production for <a href="#NT-Char">Char</a>.</p></div><p>If the character reference begins with "<code>&#x</code>", |
1145 | behavior <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em></span> apply to parameter entity references within entity values; | 1145 | the digits and letters up to the terminating <code>;</code> provide a hexadecimal |
1146 | these are described in <a href="#inliteral"><b>4.4.5 Included in Literal</b></a>.</p></div><div class="div3"> <h4><a name="error" id="error" />4.4.9 Error</h4><p>It is an <a title="Error" href="#dt-error">error</a> for a reference to | 1146 | representation of the character's code point in ISO/IEC 10646. If it begins |
1147 | an unparsed entity to appear in the <a href="#NT-EntityValue">EntityValue</a> in an | 1147 | just with "<code>&#</code>", the digits up to the terminating <code>;</code> |
1148 | entity declaration.</p></div></div><div class="div2"> <h3><a name="intern-replacement" id="intern-replacement" />4.5 Construction of Entity Replacement Text</h3><p>In discussing the treatment of entities, it is useful to distinguish | 1148 | provide a decimal representation of the character's code point.</p><p> |
1149 | two forms of the entity's value. | 1149 | [<a name="dt-entref" id="dt-entref" title="Entity Reference">Definition</a>: An <b>entity reference</b> |
1150 | [<a name="dt-litentval" id="dt-litentval" title="Literal Entity Value">Definition</a>: <span>For an | 1150 | refers to the content of a named entity.] |
1151 | internal entity, </span>the <b>literal | 1151 | [<a name="dt-GERef" id="dt-GERef" title="General Entity Reference">Definition</a>: References to parsed general entities use |
1152 | entity value</b> is the quoted string actually present in the entity declaration, | 1152 | ampersand (<code>&</code>) and semicolon (<code>;</code>) as delimiters.] |
1153 | corresponding to the non-terminal <a href="#NT-EntityValue">EntityValue</a>.] [<a name="dt-extlitentval" id="dt-extlitentval" title="Literal Entity Value">Definition</a>: For an external entity, the <b>literal | 1153 | [<a name="dt-PERef" id="dt-PERef" title="Parameter-entity reference">Definition</a>: |
1154 | entity value</b> is the exact text contained in the entity.] [<a name="dt-repltext" id="dt-repltext" title="Replacement Text">Definition</a>: <span>For an | 1154 | <b>Parameter-entity references</b> |
1155 | internal entity, </span>the <b>replacement text</b> | 1155 | use percent-sign (<code>%</code>) and semicolon (<code>;</code>) as delimiters.] |
1156 | is the content of the entity, after replacement of character references and | 1156 | </p> <h5><a name="IDA1MKS" id="IDA1MKS"/>Entity Reference</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-Reference" id="NT-Reference"/>[67] </td><td><code>Reference</code></td><td> ::= </td><td><code> |
1157 | parameter-entity references.] [<a name="dt-extrepltext" id="dt-extrepltext" title="Replacement Text">Definition</a>: For | 1157 | <a href="#NT-EntityRef">EntityRef</a> | <a href="#NT-CharRef">CharRef</a> |
1158 | an external entity, the <b>replacement text</b> is the content of the entity, | 1158 | </code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-EntityRef" id="NT-EntityRef"/>[68] </td><td><code>EntityRef</code></td><td> ::= </td><td><code>'&' <a href="#NT-Name">Name</a> ';'</code></td><td><a href="#wf-entdeclared">[WFC: Entity Declared]</a></td></tr><tr valign="baseline"><td/><td/><td/><td/><td><a href="#vc-entdeclared">[VC: Entity Declared]</a></td></tr><tr valign="baseline"><td/><td/><td/><td/><td><a href="#textent">[WFC: Parsed Entity]</a></td></tr><tr valign="baseline"><td/><td/><td/><td/><td><a href="#norecursion">[WFC: No Recursion]</a></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-PEReference" id="NT-PEReference"/>[69] </td><td><code>PEReference</code></td><td> ::= </td><td><code>'%' <a href="#NT-Name">Name</a> ';'</code></td><td><a href="#vc-entdeclared">[VC: Entity Declared]</a></td></tr><tr valign="baseline"><td/><td/><td/><td/><td><a href="#norecursion">[WFC: No Recursion]</a></td></tr><tr valign="baseline"><td/><td/><td/><td/><td><a href="#indtd">[WFC: In DTD]</a></td></tr></tbody></table><div class="constraint"><p class="prefix"><a name="wf-entdeclared" id="wf-entdeclared"/><b>Well-formedness constraint: Entity Declared</b></p><p>In a document |
1159 | after stripping the text declaration (leaving any surrounding white space) if there | 1159 | without any DTD, a document with only an internal DTD subset which contains |
1160 | is one but without any replacement of character references or parameter-entity | 1160 | no parameter entity references, or a document with "<code>standalone='yes'</code>", for |
1161 | references.]</p><p>The literal entity value as given in an internal entity declaration (<a href="#NT-EntityValue">EntityValue</a>) <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> contain character, parameter-entity, | 1161 | an entity reference that does not occur within the external subset or a parameter |
1162 | and general-entity references. Such references <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be contained entirely | 1162 | entity, the <a href="#NT-Name">Name</a> given in the entity reference <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> |
1163 | within the literal entity value. The actual replacement text that is <a title="Include" href="#dt-include">included</a><span> (or <a title="" href="#inliteral">included in literal</a>)</span> as described above | 1163 | <a title="match" href="#dt-match">match</a> that in an <a href="#sec-entity-decl"><cite>entity |
1164 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> contain the <em>replacement | 1164 | declaration</cite></a> that does not occur within the external subset or a |
1165 | text</em> of any parameter entities referred to, and <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> contain the character | 1165 | parameter entity, except that well-formed documents need not declare |
1166 | referred to, in place of any character references in the literal entity value; | 1166 | any of the following entities: <code>amp</code>, |
1167 | however, general-entity references <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be left as-is, unexpanded. For example, | 1167 | <code>lt</code>, |
1168 | given the following declarations:</p><div class="exampleInner"><pre><!ENTITY % pub "&#xc9;ditions Gallimard" > | 1168 | <code>gt</code>, |
1169 | <!ENTITY rights "All rights reserved" > | 1169 | <code>apos</code>, |
1170 | <!ENTITY book "La Peste: Albert Camus, | 1170 | <code>quot</code>. The |
1171 | &#xA9; 1947 %pub;. &rights;" ></pre></div><p>then the replacement text for the entity "<code>book</code>" | 1171 | declaration of a general entity <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> precede any reference to it which appears |
1172 | is:</p><div class="exampleInner"><pre>La Peste: Albert Camus, | 1172 | in a default value in an attribute-list declaration.</p><p>Note |
1173 | © 1947 Éditions Gallimard. &rights;</pre></div><p>The general-entity reference "<code>&rights;</code>" would | 1173 | that non-validating processors are <a href="#include-if-valid"><cite>not |
1174 | be expanded should the reference "<code>&book;</code>" appear | 1174 | obligated to</cite></a> read and process entity declarations occurring in parameter entities or in |
1175 | in the document's content or an attribute value.</p><p>These simple rules may have complex interactions; for a detailed discussion | 1175 | the external subset; for such documents, |
1176 | of a difficult example, see <a href="#sec-entexpand"><b>C Expansion of Entity and Character References</b></a>.</p></div><div class="div2"> <h3><a name="sec-predefined-ent" id="sec-predefined-ent" />4.6 Predefined Entities</h3><p>[<a name="dt-escape" id="dt-escape" title="escape">Definition</a>: Entity and character references <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> | 1176 | the rule that an entity must be declared is a well-formedness constraint only |
1177 | both be used to <b>escape</b> the left angle bracket, ampersand, and | 1177 | if <a href="#sec-rmd"><cite>standalone='yes'</cite></a>.</p></div><div class="constraint"><p class="prefix"><a name="vc-entdeclared" id="vc-entdeclared"/><b>Validity constraint: Entity Declared</b></p><p>In a document with an external subset or parameter |
1178 | other delimiters. A set of general entities (<code>amp</code>, | 1178 | <span> |
1179 | <code>lt</code>, | 1179 | entity references</span> with |
1180 | <code>gt</code>, | 1180 | "<code>standalone='no'</code>", |
1181 | <code>apos</code>, | 1181 | the <a href="#NT-Name">Name</a> given in the entity reference <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> |
1182 | <code>quot</code>) is specified for | 1182 | <a title="match" href="#dt-match">match</a> that in an <a href="#sec-entity-decl"><cite>entity |
1183 | this purpose. Numeric character references <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> also be used; they are expanded | 1183 | declaration</cite></a>. For interoperability, valid documents <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> declare |
1184 | immediately when recognized and <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be treated as character data, so the | 1184 | the entities <code>amp</code>, |
1185 | numeric character references "<code>&#60;</code>" and "<code>&#38;</code>" <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> be used to escape <code><</code> and <code>&</code> when they occur | 1185 | <code>lt</code>, |
1186 | in character data.]</p><p>All XML processors <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> recognize these entities whether they are declared | 1186 | <code>gt</code>, |
1187 | or not. <a title="For interoperability" href="#dt-interop">For interoperability</a>, valid XML | 1187 | <code>apos</code>, |
1188 | documents <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> declare these entities, like any others, before using them. If | 1188 | <code>quot</code>, in the form specified in <a href="#sec-predefined-ent"><b>4.6 Predefined Entities</b></a>. |
1189 | the entities <code>lt</code> or <code>amp</code> are declared, they <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be | 1189 | The declaration of a parameter entity <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> precede any reference to it. Similarly, |
1190 | declared as internal entities whose replacement text is a character reference | 1190 | the declaration of a general entity <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> precede any attribute-list |
1191 | to the respective | 1191 | declaration containing a default value with a direct or indirect reference |
1192 | character (less-than sign or ampersand) being escaped; the double | 1192 | to that general entity.</p></div><div class="constraint"><p class="prefix"><a name="textent" id="textent"/><b>Well-formedness constraint: Parsed Entity</b></p><p>An entity reference <em class="rfc2119" title="Keyword in RFC 2119 context">MUST |
1193 | escaping is <em class="rfc2119" title="Keyword in RFC 2119 context">REQUIRED</em> for these entities so that references to them produce | 1193 | NOT</em> contain the name of an <a title="Unparsed Entity" href="#dt-unparsed">unparsed entity</a>. |
1194 | a well-formed result. If the entities <code>gt</code>, <code>apos</code>, | 1194 | Unparsed entities may be referred to only in <a title="Attribute Value" href="#dt-attrval">attribute |
1195 | or <code>quot</code> are declared, they <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be declared as internal entities | 1195 | values</a> declared to be of type <b>ENTITY</b> or <b>ENTITIES</b>.</p></div><div class="constraint"><p class="prefix"><a name="norecursion" id="norecursion"/><b>Well-formedness constraint: No Recursion</b></p><p>A parsed entity <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> contain a recursive reference to itself, either directly or indirectly.</p></div><div class="constraint"><p class="prefix"><a name="indtd" id="indtd"/><b>Well-formedness constraint: In DTD</b></p><p>Parameter-entity references <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> appear outside |
1196 | whose replacement text is the single character being escaped (or a character | 1196 | the <a title="Document Type Declaration" href="#dt-doctype">DTD</a>.</p></div><p>Examples of character and entity references:</p><div class="exampleInner"><pre>Type <key>less-than</key> (&#x3C;) to save options. |
1197 | reference to that character; the double escaping here is <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">OPTIONAL</em></span> but harmless). | 1197 | This document was prepared on &docdate; and |
1198 | For example:</p><div class="exampleInner"><pre><!ENTITY lt "&#38;#60;"> | 1198 | is classified &security-level;.</pre></div><p>Example of a parameter-entity reference:</p><div class="exampleInner"><pre><!-- declare the parameter entity "ISOLat2"... --> |
1199 | <!ENTITY gt "&#62;"> | 1199 | <!ENTITY % ISOLat2 |
1200 | <!ENTITY amp "&#38;#38;"> | 1200 | SYSTEM "http://www.xml.com/iso/isolat2-xml.entities" > |
1201 | <!ENTITY apos "&#39;"> | 1201 | <!-- ... now reference it. --> |
1202 | <!ENTITY quot "&#34;"></pre></div></div><div class="div2"> <h3><a name="Notations" id="Notations" />4.7 Notation Declarations</h3><p>[<a name="dt-notation" id="dt-notation" title="Notation">Definition</a>: <b>Notations</b> identify | 1202 | %ISOLat2;</pre></div></div><div class="div2"> <h3><a name="sec-entity-decl" id="sec-entity-decl"/>4.2 Entity Declarations</h3><p> |
1203 | by name the format of <a title="External Entity" href="#dt-extent">unparsed entities</a>, | 1203 | [<a name="dt-entdecl" id="dt-entdecl" title="entity declaration">Definition</a>: Entities are declared |
1204 | the format of elements which bear a notation attribute, or the application | 1204 | thus:] |
1205 | to which a <a title="Processing instruction" href="#dt-pi">processing instruction</a> is addressed.]</p><p>[<a name="dt-notdecl" id="dt-notdecl" title="Notation Declaration">Definition</a>: <b>Notation declarations</b> | 1205 | </p> <h5><a name="IDABVKS" id="IDABVKS"/>Entity Declaration</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-EntityDecl" id="NT-EntityDecl"/>[70] </td><td><code>EntityDecl</code></td><td> ::= </td><td><code> |
1206 | provide a name for the notation, for use in entity and attribute-list declarations | 1206 | <a href="#NT-GEDecl">GEDecl</a> |
1207 | and in attribute specifications, and an external identifier for the notation | 1207 | | <a href="#NT-PEDecl">PEDecl</a> |
1208 | which may allow an XML processor or its client application to locate a helper | 1208 | </code></td></tr><tr valign="baseline"><td><a name="NT-GEDecl" id="NT-GEDecl"/>[71] </td><td><code>GEDecl</code></td><td> ::= </td><td><code>'<!ENTITY' <a href="#NT-S">S</a> |
1209 | application capable of processing data in the given notation.]</p> <h5><a name="IDAYTFU" id="IDAYTFU" />Notation Declarations</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-NotationDecl" id="NT-NotationDecl" />[82] </td><td><code>NotationDecl</code></td><td> ::= </td><td><code>'<!NOTATION' <a href="#NT-S">S</a> <a href="#NT-Name">Name</a> <a href="#NT-S">S</a> (<a href="#NT-ExternalID">ExternalID</a> | <a href="#NT-PublicID">PublicID</a>) <a href="#NT-S">S</a>? '>'</code></td><td><a href="#UniqueNotationName">[VC: Unique Notation Name]</a></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-PublicID" id="NT-PublicID" />[83] </td><td><code>PublicID</code></td><td> ::= </td><td><code>'PUBLIC' <a href="#NT-S">S</a> <a href="#NT-PubidLiteral">PubidLiteral</a></code></td></tr></tbody></table><div class="constraint"><p class="prefix"><a name="UniqueNotationName" id="UniqueNotationName" /><b>Validity constraint: Unique Notation Name</b></p><p><span class="mustard">A given <a href="#NT-Name">Name</a> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> be declared in more than one notation declaration.</span></p></div><p>XML processors <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> provide applications with the name and external identifier(s) | 1209 | <a href="#NT-Name">Name</a> |
1210 | of any notation declared and referred to in an attribute value, attribute | 1210 | <a href="#NT-S">S</a> |
1211 | definition, or entity declaration. They <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> additionally resolve the external | 1211 | <a href="#NT-EntityDef">EntityDef</a> |
1212 | identifier into the <a title="System Identifier" href="#dt-sysid">system identifier</a>, file | 1212 | <a href="#NT-S">S</a>? |
1213 | name, or other information needed to allow the application to call a processor | 1213 | '>'</code></td></tr><tr valign="baseline"><td><a name="NT-PEDecl" id="NT-PEDecl"/>[72] </td><td><code>PEDecl</code></td><td> ::= </td><td><code>'<!ENTITY' <a href="#NT-S">S</a> '%' <a href="#NT-S">S</a> |
1214 | for data in the notation described. (It is not an error, however, for XML | 1214 | <a href="#NT-Name">Name</a> |
1215 | documents to declare and refer to notations for which notation-specific applications | 1215 | <a href="#NT-S">S</a> |
1216 | are not available on the system where the XML processor or application is | 1216 | <a href="#NT-PEDef">PEDef</a> |
1217 | running.)</p></div><div class="div2"> <h3><a name="sec-doc-entity" id="sec-doc-entity" />4.8 Document Entity</h3><p>[<a name="dt-docent" id="dt-docent" title="Document Entity">Definition</a>: The <b>document entity</b> | 1217 | <a href="#NT-S">S</a>? '>'</code></td></tr><tr valign="baseline"><td><a name="NT-EntityDef" id="NT-EntityDef"/>[73] </td><td><code>EntityDef</code></td><td> ::= </td><td><code> |
1218 | serves as the root of the entity tree and a starting-point for an <a title="XML Processor" href="#dt-xml-proc">XML processor</a>.] This specification does | 1218 | <a href="#NT-EntityValue">EntityValue</a> |
1219 | not specify how the document entity is to be located by an XML processor; | 1219 | | (<a href="#NT-ExternalID">ExternalID</a> |
1220 | unlike other entities, the document entity has no name and might well appear | 1220 | <a href="#NT-NDataDecl">NDataDecl</a>?)</code></td></tr><tr valign="baseline"><td><a name="NT-PEDef" id="NT-PEDef"/>[74] </td><td><code>PEDef</code></td><td> ::= </td><td><code> |
1221 | on a processor input stream without any identification at all.</p></div></div><div class="div1"> <h2><a name="sec-conformance" id="sec-conformance" />5 Conformance</h2><div class="div2"> <h3><a name="proc-types" id="proc-types" />5.1 Validating and Non-Validating Processors</h3><p>Conforming <a title="XML Processor" href="#dt-xml-proc">XML processors</a> fall into | 1221 | <a href="#NT-EntityValue">EntityValue</a> | <a href="#NT-ExternalID">ExternalID</a> |
1222 | two classes: validating and non-validating.</p><p>Validating and non-validating processors alike <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> report violations of | 1222 | </code></td></tr></tbody></table><p>The <a href="#NT-Name">Name</a> identifies the entity in an <a title="Entity Reference" href="#dt-entref">entity |
1223 | this specification's well-formedness constraints in the content of the <a title="Document Entity" href="#dt-docent">document entity</a> and any other <a title="Text Entity" href="#dt-parsedent">parsed | 1223 | reference</a> or, in the case of an unparsed entity, in the value of |
1224 | entities</a> that they read.</p><p>[<a name="dt-validating" id="dt-validating" title="Validating Processor">Definition</a>: <b>Validating | 1224 | an <b>ENTITY</b> or <b>ENTITIES</b> attribute. If the same entity is declared |
1225 | processors</b> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em>, | 1225 | more than once, the first declaration encountered is binding; at user option, |
1226 | at user option, report violations of the constraints expressed by | 1226 | an XML processor <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> issue a warning if entities are declared multiple times.</p><div class="div3"> <h4><a name="sec-internal-ent" id="sec-internal-ent"/>4.2.1 Internal Entities</h4><p> |
1227 | the declarations in the <a title="Document Type Declaration" href="#dt-doctype">DTD</a>, and failures | 1227 | [<a name="dt-internent" id="dt-internent" title="Internal Entity Replacement Text">Definition</a>: If the |
1228 | to fulfill the validity constraints given in this specification.] | 1228 | entity definition is an <a href="#NT-EntityValue">EntityValue</a>, the defined |
1229 | To accomplish this, validating XML processors <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> read and process the entire | 1229 | entity is called an <b>internal entity</b>. There is no separate physical |
1230 | DTD and all external parsed entities referenced in the document.</p><p>Non-validating processors are <em class="rfc2119" title="Keyword in RFC 2119 context">REQUIRED</em> to check only the <a title="Document Entity" href="#dt-docent">document | 1230 | storage object, and the content of the entity is given in the declaration.] |
1231 | entity</a>, including the entire internal DTD subset, for well-formedness. [<a name="dt-use-mdecl" id="dt-use-mdecl" title="Process Declarations">Definition</a>: While they are not required | 1231 | Note that some processing of entity and character references in the <a title="Literal Entity Value" href="#dt-litentval">literal entity value</a> may be required to produce |
1232 | to check the document for validity, they are <em class="rfc2119" title="Keyword in RFC 2119 context">REQUIRED</em> to <b>process</b> | 1232 | the correct <a title="Replacement Text" href="#dt-repltext">replacement text</a>: see <a href="#intern-replacement"><b>4.5 Construction of Entity Replacement Text</b></a>.</p><p>An internal entity is a <a title="Text Entity" href="#dt-parsedent">parsed entity</a>.</p><p>Example of an internal entity declaration:</p><div class="exampleInner"><pre><!ENTITY Pub-Status "This is a pre-release of the |
1233 | all the declarations they read in the internal DTD subset and in any parameter | 1233 | specification."></pre></div></div><div class="div3"> <h4><a name="sec-external-ent" id="sec-external-ent"/>4.2.2 External Entities</h4><p> |
1234 | entity that they read, up to the first reference to a parameter entity that | 1234 | [<a name="dt-extent" id="dt-extent" title="External Entity">Definition</a>: If the entity is not internal, |
1235 | they do <em>not</em> read; that is to say, they <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> use the information | 1235 | it is an <b>external entity</b>, declared as follows:] |
1236 | in those declarations to <a href="#AVNormalize"><cite>normalize</cite></a> | 1236 | </p> <h5><a name="IDAX1KS" id="IDAX1KS"/>External Entity Declaration</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-ExternalID" id="NT-ExternalID"/>[75] </td><td><code>ExternalID</code></td><td> ::= </td><td><code>'SYSTEM' <a href="#NT-S">S</a> |
1237 | attribute values, <a href="#included"><cite>include</cite></a> the replacement | 1237 | <a href="#NT-SystemLiteral">SystemLiteral</a> |
1238 | text of internal entities, and supply <a href="#sec-attr-defaults"><cite>default | 1238 | </code></td></tr><tr valign="baseline"><td/><td/><td/><td><code>| 'PUBLIC' <a href="#NT-S">S</a> |
1239 | attribute values</cite></a>.] Except when <code>standalone="yes"</code>, they | 1239 | <a href="#NT-PubidLiteral">PubidLiteral</a> |
1240 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> <a title="Process Declarations" href="#dt-use-mdecl">process</a> <a title="entity declaration" href="#dt-entdecl">entity | 1240 | <a href="#NT-S">S</a> |
1241 | declarations</a> or <a title="Attribute-List Declaration" href="#dt-attdecl">attribute-list declarations</a> | 1241 | <a href="#NT-SystemLiteral">SystemLiteral</a> |
1242 | encountered after a reference to a parameter entity that is not read, since | 1242 | </code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-NDataDecl" id="NT-NDataDecl"/>[76] </td><td><code>NDataDecl</code></td><td> ::= </td><td><code> |
1243 | the entity may have contained overriding declarations<span>; when <code>standalone="yes"</code>, processors <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> | 1243 | <a href="#NT-S">S</a> 'NDATA' <a href="#NT-S">S</a> |
1244 | process these declarations</span>.</p><p>Note | 1244 | <a href="#NT-Name">Name</a> |
1245 | that when processing invalid documents with a non-validating | 1245 | </code></td><td><a href="#not-declared">[VC: Notation Declared]</a></td></tr></tbody></table><p>If the <a href="#NT-NDataDecl">NDataDecl</a> is present, this is a general <a title="Unparsed Entity" href="#dt-unparsed">unparsed entity</a>; otherwise it is a parsed entity.</p><div class="constraint"><p class="prefix"><a name="not-declared" id="not-declared"/><b>Validity constraint: Notation Declared</b></p><p>The <a href="#NT-Name">Name</a> |
1246 | processor the application may not be presented with consistent | 1246 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match the declared name of a <a title="Notation" href="#dt-notation">notation</a>.</p></div><p> |
1247 | information. For example, several requirements for uniqueness | 1247 | [<a name="dt-sysid" id="dt-sysid" title="System Identifier">Definition</a>: The <a href="#NT-SystemLiteral">SystemLiteral</a> is called the entity's <b>system |
1248 | within the document may not be met, including more than one element | 1248 | identifier</b>. It is meant to be |
1249 | with the same id, duplicate declarations of elements or notations | 1249 | converted to a URI reference |
1250 | with the same name, etc. In these cases the behavior of the parser | 1250 | (as defined in <a href="#rfc3986">[IETF RFC 3986]</a>), |
1251 | with respect to reporting such information to the application is | 1251 | as part of the |
1252 | undefined.</p><p>XML 1.1 processors <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be able to process both XML 1.0 | 1252 | process of dereferencing it to obtain input for the XML processor to construct the |
1253 | and XML 1.1 documents. Programs which generate XML <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> | 1253 | entity's replacement text.] It is an error for a fragment identifier |
1254 | generate XML 1.0, unless one of the specific features of XML 1.1 is required.</p></div><div class="div2"> <h3><a name="safe-behavior" id="safe-behavior" />5.2 Using XML Processors</h3><p>The behavior of a validating XML processor is highly predictable; it must | 1254 | (beginning with a <code>#</code> character) to be part of a system identifier. |
1255 | read every piece of a document and report all well-formedness and validity | 1255 | Unless otherwise provided by information outside the scope of this specification |
1256 | violations. Less is required of a non-validating processor; it need not read | 1256 | (e.g. a special XML element type defined by a particular DTD, or a processing |
1257 | any part of the document other than the document entity. This has two effects | 1257 | instruction defined by a particular application specification), relative URIs |
1258 | that may be important to users of XML processors:</p><ul><li><p>Certain well-formedness errors, specifically those that require reading | 1258 | are relative to the location of the resource within which the entity declaration |
1259 | external entities, <span>may fail to</span> be detected by a non-validating processor. Examples | 1259 | occurs. This is defined to |
1260 | include the constraints entitled <a href="#wf-entdeclared"><cite>Entity Declared</cite></a>, <a href="#textent"><cite>Parsed Entity</cite></a>, and <a href="#norecursion"><cite>No | 1260 | be the external entity containing the '<' which starts the declaration, at the |
1261 | Recursion</cite></a>, as well as some of the cases described as <a href="#forbidden"><cite>forbidden</cite></a> in <a href="#entproc"><b>4.4 XML Processor Treatment of Entities and References</b></a>.</p></li><li><p>The information passed from the processor to the application may | 1261 | point when it is parsed as a declaration. |
1262 | vary, depending on whether the processor reads parameter and external entities. | 1262 | A URI might thus be relative to the <a title="Document Entity" href="#dt-docent">document |
1263 | For example, a non-validating processor <span>may fail to</span> <a href="#AVNormalize"><cite>normalize</cite></a> | 1263 | entity</a>, to the entity containing the <a title="Document Type Declaration" href="#dt-doctype">external |
1264 | attribute values, <a href="#included"><cite>include</cite></a> the replacement | 1264 | DTD subset</a>, or to some other <a title="External Entity" href="#dt-extent">external parameter |
1265 | text of internal entities, or supply <a href="#sec-attr-defaults"><cite>default | 1265 | entity</a>. Attempts to |
1266 | attribute values</cite></a>, where doing so depends on having read declarations | 1266 | retrieve the resource identified by a URI <span>may</span> be redirected at the parser |
1267 | in external or parameter entities.</p></li></ul><p>For maximum reliability in interoperating between different XML processors, | 1267 | level (for example, in an entity resolver) or below (at the protocol level, |
1268 | applications which use non-validating processors <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD NOT</em> rely on any behaviors | 1268 | for example, via an HTTP <code>Location:</code> header). In the absence of additional |
1269 | not required of such processors. Applications which require DTD facilities | 1269 | information outside the scope of this specification within the resource, |
1270 | not related to validation (such | 1270 | the base URI of a resource is always the URI of the actual resource returned. |
1271 | as the declaration of default attributes and internal entities that are | 1271 | In other words, it is the URI of the resource retrieved after all redirection |
1272 | or may be specified in | 1272 | has occurred.</p><p>System |
1273 | external entities) <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> use validating XML processors.</p></div></div><div class="div1"> <h2><a name="sec-notation" id="sec-notation" />6 Notation</h2><p>The formal grammar of XML is given in this specification using a simple | 1273 | identifiers (and other XML strings meant to be used as URI references) <span>may</span> contain |
1274 | Extended Backus-Naur Form (EBNF) notation. Each rule in the grammar defines | 1274 | characters that, according to <a href="#rfc3986">[IETF RFC 3986]</a>, |
1275 | one symbol, in the form</p><div class="exampleInner"><pre>symbol ::= expression</pre></div><p>Symbols are written with an initial capital letter if they are the | 1275 | must be escaped before a URI can be used to retrieve the referenced resource. The |
1276 | start symbol of a regular language, otherwise with an initial lowercase | 1276 | characters to be escaped are the control characters #x0 to #x1F and #x7F (most of |
1277 | letter. Literal strings are quoted.</p><p>Within the expression on the right-hand side of a rule, the following expressions | 1277 | which cannot appear in XML), space #x20, the delimiters '<' #x3C, '>' #x3E and |
1278 | are used to match strings of one or more characters: </p><dl><dt class="label"><code>#xN</code></dt><dd><p>where <code>N</code> is a hexadecimal integer, the expression matches the character | 1278 | '"' #x22, the <em>unwise</em> characters '{' #x7B, '}' #x7D, '|' #x7C, '\' #x5C, '^' #x5E and |
1279 | <span>whose</span><span> number | 1279 | '`' #x60, as well as all characters above #x7F. Since escaping is not always a fully |
1280 | (code point) in</span> ISO/IEC 10646 <span>is <code>N</code></span>. The number of leading zeros in the <code>#xN</code> | 1280 | reversible process, it <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be performed only when absolutely necessary and as late |
1281 | form is insignificant.</p></dd><dt class="label"><code>[a-zA-Z]</code>, <code>[#xN-#xN]</code></dt><dd><p>matches any <a href="#NT-Char">Char</a> with a value in the range(s) indicated (inclusive).</p></dd><dt class="label"><code>[abc]</code>, <code>[#xN#xN#xN]</code></dt><dd><p>matches any <a href="#NT-Char">Char</a> with a value among the characters | 1281 | as possible in a processing chain. In particular, neither the process of converting |
1282 | enumerated. Enumerations and ranges can be mixed in one set of brackets.</p></dd><dt class="label"><code>[^a-z]</code>, <code>[^#xN-#xN]</code></dt><dd><p>matches any <a href="#NT-Char">Char</a> with a value <em>outside</em> the range | 1282 | a relative URI to an absolute one nor the process of passing a URI reference to a |
1283 | indicated.</p></dd><dt class="label"><code>[^abc]</code>, <code>[^#xN#xN#xN]</code></dt><dd><p>matches any <a href="#NT-Char">Char</a> with a value not among the characters given. Enumerations | 1283 | process or software component responsible for dereferencing it <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> trigger escaping. |
1284 | and ranges of forbidden values can be mixed in one set of brackets.</p></dd><dt class="label"><code>"string"</code></dt><dd><p>matches a literal string <a title="match" href="#dt-match">matching</a> that | 1284 | When escaping does occur, it <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be performed as follows:</p><ol class="enumar"><li><p>Each character to be escaped is represented in |
1285 | given inside the double quotes.</p></dd><dt class="label"><code>'string'</code></dt><dd><p>matches a literal string <a title="match" href="#dt-match">matching</a> that | 1285 | UTF-8 <a href="#Unicode">[Unicode]</a> |
1286 | given inside the single quotes.</p></dd></dl><p> These symbols may be combined to match more complex patterns as follows, | 1286 | as one or more bytes.</p></li><li><p>The resulting bytes are escaped with |
1287 | where <code>A</code> and <code>B</code> represent simple expressions: </p><dl><dt class="label">(<code>expression</code>)</dt><dd><p><code>expression</code> is treated as a unit and may be combined as described | 1287 | the URI escaping mechanism (that is, converted to <code>%</code><var>HH</var>, |
1288 | in this list.</p></dd><dt class="label"><code>A?</code></dt><dd><p>matches <code>A</code> or nothing; optional <code>A</code>.</p></dd><dt class="label"><code>A B</code></dt><dd><p>matches <code>A</code> followed by <code>B</code>. This | 1288 | where HH is the hexadecimal notation of the byte value).</p></li><li><p>The original character is replaced by the resulting character sequence.</p></li></ol><p> |
1289 | operator has higher precedence than alternation; thus <code>A B | C D</code> | 1289 | [<a name="dt-pubid" id="dt-pubid" title="Public identifier">Definition</a>: In addition to a system |
1290 | is identical to <code>(A B) | (C D)</code>.</p></dd><dt class="label"><code>A | B</code></dt><dd><p>matches <code>A</code> or <code>B</code>.</p></dd><dt class="label"><code>A - B</code></dt><dd><p>matches any string that matches <code>A</code> but does not match <code>B</code>.</p></dd><dt class="label"><code>A+</code></dt><dd><p>matches one or more occurrences of <code>A</code>. Concatenation | 1290 | identifier, an external identifier <span>may</span> include a <b>public identifier</b>.] |
1291 | has higher precedence than alternation; thus <code>A+ | B+</code> is identical | 1291 | An XML processor attempting to retrieve the entity's content <span>may</span> use |
1292 | to <code>(A+) | (B+)</code>.</p></dd><dt class="label"><code>A*</code></dt><dd><p>matches zero or more occurrences of <code>A</code>. Concatenation | 1292 | any combination of |
1293 | has higher precedence than alternation; thus <code>A* | B*</code> is identical | 1293 | the public and system identifiers as well as additional information outside the |
1294 | to <code>(A*) | (B*)</code>.</p></dd></dl><p> Other notations used in the productions are: </p><dl><dt class="label"><code>/* ... */</code></dt><dd><p>comment.</p></dd><dt class="label"><code>[ wfc: ... ]</code></dt><dd><p>well-formedness constraint; this identifies by name a constraint on <a title="Well-Formed" href="#dt-wellformed">well-formed</a> documents associated with a production.</p></dd><dt class="label"><code>[ vc: ... ]</code></dt><dd><p>validity constraint; this identifies by name a constraint on <a title="Validity" href="#dt-valid">valid</a> | 1294 | scope of this specification to try to generate an alternative URI reference. |
1295 | documents associated with a production.</p></dd></dl><p></p></div></div><div class="back"><div class="div1"> <h2><a name="sec-bibliography" id="sec-bibliography" />A References</h2><div class="div2"> <h3><a name="sec-existing-stds" id="sec-existing-stds" />A.1 Normative References</h3><dl><dt class="label"><a name="IANA" id="IANA" />IANA-CHARSETS</dt><dd>(Internet | 1295 | If the processor is unable to do so, it <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> use the URI |
1296 | Assigned Numbers Authority) <a href="http://www.iana.org/assignments/character-sets"><cite>Official Names for Character Sets</cite></a>, | 1296 | reference specified in the system literal. Before a match is attempted, |
1297 | ed. Keld Simonsen et al. (See http://www.iana.org/assignments/character-sets.)</dd><dt class="label"><a name="rfc2119" id="rfc2119" />IETF RFC 2119</dt><dd>IETF | 1297 | all strings of white space in the public identifier <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be normalized to |
1298 | (Internet Engineering Task Force). <a href="http://www.ietf.org/rfc/rfc2119.txt"><cite>RFC 2119: Key words for use in RFCs to Indicate Requirement Levels</cite></a>. | 1298 | single space characters (#x20), and leading and trailing white space <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> |
1299 | Scott Bradner, 1997. (See http://www.ietf.org/rfc/rfc2119.txt.)</dd><dt class="label"><a name="rfc2396" id="rfc2396" />IETF RFC 2396</dt><dd>IETF | 1299 | be removed.</p><p>Examples of external entity declarations:</p><div class="exampleInner"><pre><!ENTITY open-hatch |
1300 | (Internet Engineering Task Force). <a href="http://www.ietf.org/rfc/rfc2396.txt"><cite>RFC 2396: Uniform Resource Identifiers | 1300 | SYSTEM "http://www.textuality.com/boilerplate/OpenHatch.xml"> |
1301 | (URI): Generic Syntax</cite></a>. T. Berners-Lee, R. Fielding, L. Masinter. | 1301 | <!ENTITY open-hatch |
1302 | 1998. (See http://www.ietf.org/rfc/rfc2396.txt.)</dd><dt class="label"><a name="rfc2732" id="rfc2732" />IETF RFC 2732</dt><dd>IETF | 1302 | PUBLIC "-//Textuality//TEXT Standard open-hatch boilerplate//EN" |
1303 | (Internet Engineering Task Force). <a href="http://www.ietf.org/rfc/rfc2732.txt"><cite>RFC 2732: Format for Literal | 1303 | "http://www.textuality.com/boilerplate/OpenHatch.xml"> |
1304 | IPv6 Addresses in URL's</cite></a>. R. Hinden, B. Carpenter, L. Masinter. | 1304 | <!ENTITY hatch-pic |
1305 | 1999. (See http://www.ietf.org/rfc/rfc2732.txt.)</dd><dt class="label"><a name="RFC1766" id="RFC1766" />IETF RFC 3066</dt><dd>IETF | 1305 | SYSTEM "../grafix/OpenHatch.gif" |
1306 | (Internet Engineering Task Force). <a href="http://www.ietf.org/rfc/rfc3066.txt"><cite>RFC 3066: Tags for the Identification | 1306 | NDATA gif ></pre></div></div></div><div class="div2"> <h3><a name="TextEntities" id="TextEntities"/>4.3 Parsed Entities</h3><div class="div3"> <h4><a name="sec-TextDecl" id="sec-TextDecl"/>4.3.1 The Text Declaration</h4><p>External parsed entities <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> each begin with a <b>text declaration</b>.</p> <h5><a name="IDAGDLS" id="IDAGDLS"/>Text Declaration</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-TextDecl" id="NT-TextDecl"/>[77] </td><td><code>TextDecl</code></td><td> ::= </td><td><code>'<?xml' <a href="#NT-VersionInfo">VersionInfo</a>? <a href="#NT-EncodingDecl">EncodingDecl</a> |
1307 | of Languages</cite></a>, ed. H. Alvestrand. 2001. (See http://www.ietf.org/rfc/rfc3066.txt.)</dd><dt class="label"><a name="ISO10646" id="ISO10646" />ISO/IEC 10646</dt><dd><span>ISO (International | 1307 | <a href="#NT-S">S</a>? '?>'</code></td></tr></tbody></table><p>The text declaration <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be provided literally, not by reference |
1308 | Organization for Standardization). <cite>ISO/IEC 10646-1:2000. Information | 1308 | to a parsed entity. The text declaration |
1309 | technology — Universal Multiple-Octet Coded Character Set (UCS) — | 1309 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> appear at any |
1310 | Part 1: Architecture and Basic Multilingual Plane</cite> and <cite>ISO/IEC 10646-2:2001. | 1310 | position other than the beginning of an external parsed entity. The text declaration |
1311 | Information technology — Universal Multiple-Octet Coded Character Set (UCS) — Part 2: | 1311 | in an external parsed entity is not considered part of its <a title="Replacement Text" href="#dt-repltext">replacement text</a>.</p></div><div class="div3"> <h4><a name="wf-entities" id="wf-entities"/>4.3.2 Well-Formed Parsed Entities</h4><p>The document entity is well-formed if it matches the production labeled <a href="#NT-document">document</a>. An external general parsed entity is well-formed |
1312 | Supplementary Planes</cite>, as, from time to time, amended, replaced by a new edition or | 1312 | if it matches the production labeled <a href="#NT-extParsedEnt">extParsedEnt</a>. All |
1313 | expanded by the addition of new parts. [Geneva]: International Organization for Standardization. | 1313 | external parameter entities are well-formed by definition.</p><div class="note"><p class="prefix"><b>Note:</b></p><p> |
1314 | (See <a href="http://www.iso.ch">http://www.iso.ch</a> for the latest version.)</span></dd><dt class="label"><a name="Unicode" id="Unicode" />Unicode</dt><dd>The Unicode Consortium. <em>The Unicode | 1314 | Only parsed entities that are referenced directly or indirectly within the document are required to be well-formed.</p></div> <h5><a name="IDAKFLS" id="IDAKFLS"/>Well-Formed External Parsed Entity</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-extParsedEnt" id="NT-extParsedEnt"/>[78] </td><td><code>extParsedEnt</code></td><td> ::= </td><td><code> |
1315 | Standard, Version 4.0.</em> Reading, Mass.: Addison-Wesley, | 1315 | <span> |
1316 | 2003, | 1316 | (</span> |
1317 | as updated from time to time by the publication of new versions. (See | 1317 | <a href="#NT-TextDecl">TextDecl</a>? <a href="#NT-content">content</a> |
1318 | <a href="http://www.unicode.org/unicode/standard/versions"> | 1318 | <span> |
1319 | http://www.unicode.org/unicode/standard/versions</a> for the latest version | 1319 | )</span> - <span> |
1320 | and additional information on versions of the standard and of the Unicode | 1320 | (</span> |
1321 | Character Database).</dd><dt class="label"><a name="XML1.0" />XML-1.0</dt><dd>W3C. <a href="http://www.w3.org/TR/REC-xml"><cite>Extensible Markup Language (XML) 1.0 (Third | 1321 | <a href="#NT-Char">Char</a>* <a href="#NT-RestrictedChar">RestrictedChar</a> |
1322 | Edition)</cite></a>. Tim Bray, Jean Paoli, C.M. Sperberg-McQueen, Eve Maler, François Yergeau | 1322 | <a href="#NT-Char">Char</a>*<span> |
1323 | (editors). (See http://www.w3.org/TR/REC-xml.)</dd></dl></div><div class="div2"> <h3><a name="null" id="null" />A.2 Other References</h3><dl><dt class="label"><a name="Aho" id="Aho" />Aho/Ullman</dt><dd>Aho, Alfred V., Ravi Sethi, and Jeffrey D. | 1323 | )</span> |
1324 | Ullman. <cite>Compilers: Principles, Techniques, and Tools</cite>. | 1324 | </code></td></tr></tbody></table><p>An internal general parsed entity is well-formed if its replacement text |
1325 | Reading: Addison-Wesley, 1986, rpt. corr. 1988.</dd><dt class="label"><a name="ABK" id="ABK" />Brüggemann-Klein</dt><dd>Brüggemann-Klein, | 1325 | matches the production labeled <a href="#NT-content">content</a>. All internal |
1326 | Anne. <a href="ftp://ftp.informatik.uni-freiburg.de/documents/papers/brueggem/habil.ps"><cite>Formal Models in Document Processing</cite></a>. Habilitationsschrift. Faculty | 1326 | parameter entities are well-formed by definition.</p><p>A consequence of well-formedness in general |
1327 | of Mathematics at the University of Freiburg, 1993. (See ftp://ftp.informatik.uni-freiburg.de/documents/papers/brueggem/habil.ps.)</dd><dt class="label"><a name="ABKDW" id="ABKDW" />Brüggemann-Klein and Wood</dt><dd>Brüggemann-Klein, | 1327 | entities is that the logical and physical |
1328 | Anne, and Derick Wood. <cite>Deterministic Regular Languages</cite>. | 1328 | structures in an XML document are properly nested; no <a title="Start-Tag" href="#dt-stag">start-tag</a>, <a title="End Tag" href="#dt-etag">end-tag</a>, <a title="Empty" href="#dt-empty">empty-element tag</a>, <a title="Element" href="#dt-element">element</a>, <a title="Comment" href="#dt-comment">comment</a>, <a title="Processing instruction" href="#dt-pi">processing instruction</a>, <a title="Character Reference" href="#dt-charref">character |
1329 | Universität Freiburg, Institut für Informatik, Bericht 38, Oktober 1991. Extended | 1329 | reference</a>, or <a title="Entity Reference" href="#dt-entref">entity reference</a> |
1330 | abstract in A. Finkel, M. Jantzen, Hrsg., STACS 1992, S. 173-184. Springer-Verlag, | 1330 | can begin in one entity and end in another.</p></div><div class="div3"> <h4><a name="charencoding" id="charencoding"/>4.3.3 Character Encoding in Entities</h4><p>Each external parsed entity in an XML document <span>may</span> use a different encoding |
1331 | Berlin 1992. Lecture Notes in Computer Science 577. Full version titled <cite>One-Unambiguous | 1331 | for its characters. All XML processors <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be able to read entities in both |
1332 | Regular Languages</cite> in Information and Computation 140 (2): 229-253, | 1332 | the UTF-8 and UTF-16 encodings. The terms "UTF-8" |
1333 | February 1998.</dd><dt class="label"><a name="Charmod" />Charmod</dt><dd>W3C Working Draft. | 1333 | and "UTF-16" in this specification do not apply to character |
1334 | 1334 | encodings with any other labels, even if the encodings or labels are very | |
1335 | <a href="http://www.w3.org/TR/2003/WD-charmod-20030822/"><cite>Character Model for the World Wide Web 1.0</cite></a>. | 1335 | similar to UTF-8 or UTF-16.</p><p>Entities encoded in UTF-16 <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> and entities |
1336 | 1336 | encoded in UTF-8 <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> begin with the Byte Order Mark described in | |
1337 | Martin J. Dürst, François Yergeau, Richard Ishida, Misha Wolf, Tex Texin. (See http://www.w3.org/TR/2003/WD-charmod-20030822/.)</dd><dt class="label"><a name="Clark" id="Clark" />Clark</dt><dd>James Clark. | 1337 | ISO/IEC 10646 <a href="#ISO10646">[ISO/IEC 10646]</a> or Unicode <a href="#Unicode">[Unicode]</a> |
1338 | <a href="http://www.w3.org/TR/NOTE-sgml-xml-971215"><cite>Comparison of SGML and XML</cite></a>. (See http://www.w3.org/TR/NOTE-sgml-xml-971215.)</dd><dt class="label"><a name="IANA-LANGCODES" id="IANA-LANGCODES" />IANA-LANGCODES</dt><dd>(Internet | 1338 | (the ZERO WIDTH NO-BREAK SPACE character, #xFEFF). This is an encoding signature, |
1339 | Assigned Numbers Authority) <a href="http://www.iana.org/assignments/language-tags"><cite>Registry of Language Tags</cite></a>, | 1339 | not part of either the markup or the character data of the XML document. XML |
1340 | ed. Keld Simonsen et al. (See http://www.iana.org/assignments/language-tags.)</dd><dt class="label"><a name="RFC2141" id="RFC2141" />IETF RFC 2141</dt><dd>IETF | 1340 | processors <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be able to use this character to differentiate between UTF-8 |
1341 | (Internet Engineering Task Force). <a href="http://www.ietf.org/rfc/rfc2141.txt"><cite>RFC 2141: URN Syntax</cite></a>, ed. | 1341 | and UTF-16 encoded documents.</p><p>Although an XML processor is required to read only entities in the UTF-8 |
1342 | R. Moats. 1997. (See http://www.ietf.org/rfc/rfc2141.txt.)</dd><dt class="label"><a name="rfc2376" id="rfc2376" />IETF RFC 3023</dt><dd>IETF | 1342 | and UTF-16 encodings, it is recognized that other encodings are used around |
1343 | (Internet Engineering Task Force). <a href="http://www.ietf.org/rfc/rfc3023.txt"><cite>RFC 3023: XML Media Types</cite></a>, | 1343 | the world, and it may be desired for XML processors to read entities that |
1344 | eds. M. Murata, S. St.Laurent, D. Kohn. 2001. (See http://www.ietf.org/rfc/rfc3023.txt.)</dd><dt class="label"><a name="rfc2781" id="rfc2781" />IETF RFC 2781</dt><dd>IETF | 1344 | use them. In |
1345 | (Internet Engineering Task Force). <a href="http://www.ietf.org/rfc/rfc2781.txt"><cite>RFC 2781: UTF-16, an encoding | 1345 | the absence of external character encoding information (such as MIME headers), |
1346 | of ISO 10646</cite></a>, ed. P. Hoffman, F. Yergeau. 2000. (See http://www.ietf.org/rfc/rfc2781.txt.)</dd><dt class="label"><a name="ISO639" id="ISO639" />ISO 639</dt><dd>(International Organization for Standardization). | 1346 | parsed entities which are stored in an encoding other than UTF-8 or UTF-16 |
1347 | <cite>ISO 639:1988 (E). | 1347 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> begin with a text declaration (see <a href="#sec-TextDecl"><b>4.3.1 The Text Declaration</b></a>) containing |
1348 | Code for the representation of names of languages.</cite> [Geneva]: International | 1348 | an encoding declaration:</p> <h5><a name="IDABKLS" id="IDABKLS"/>Encoding Declaration</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-EncodingDecl" id="NT-EncodingDecl"/>[80] </td><td><code>EncodingDecl</code></td><td> ::= </td><td><code> |
1349 | Organization for Standardization, 1988.</dd><dt class="label"><a name="ISO3166" id="ISO3166" />ISO 3166</dt><dd>(International Organization for Standardization). | 1349 | <a href="#NT-S">S</a> 'encoding' <a href="#NT-Eq">Eq</a> |
1350 | <cite>ISO 3166-1:1997 | 1350 | ('"' <a href="#NT-EncName">EncName</a> '"' | "'" <a href="#NT-EncName">EncName</a> |
1351 | (E). Codes for the representation of names of countries and their subdivisions — | 1351 | "'" ) </code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-EncName" id="NT-EncName"/>[81] </td><td><code>EncName</code></td><td> ::= </td><td><code>[A-Za-z] ([A-Za-z0-9._] | '-')*</code></td><td><i>/* Encoding |
1352 | Part 1: Country codes</cite> [Geneva]: International Organization for | 1352 | name contains only Latin characters */</i></td></tr></tbody></table><p>In the <a title="Document Entity" href="#dt-docent">document entity</a>, the encoding |
1353 | Standardization, 1997.</dd><dt class="label"><a name="ISO8879" id="ISO8879" />ISO 8879</dt><dd>ISO (International Organization for Standardization). <cite>ISO | 1353 | declaration is part of the <a title="XML Declaration" href="#dt-xmldecl">XML declaration</a>. |
1354 | 8879:1986(E). Information processing — Text and Office Systems — | 1354 | The <a href="#NT-EncName">EncName</a> is the name of the encoding used.</p><p>In an encoding declaration, the values "<code>UTF-8</code>", "<code>UTF-16</code>", |
1355 | Standard Generalized Markup Language (SGML).</cite> First edition — | 1355 | "<code>ISO-10646-UCS-2</code>", and "<code>ISO-10646-UCS-4</code>" |
1356 | 1986-10-15. [Geneva]: International Organization for Standardization, 1986. </dd><dt class="label"><a name="ISO10744" id="ISO10744" />ISO/IEC 10744</dt><dd>ISO (International Organization for | 1356 | <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> be used |
1357 | Standardization). <cite>ISO/IEC 10744-1992 (E). Information technology — | 1357 | for the various encodings and transformations of Unicode / ISO/IEC 10646, |
1358 | Hypermedia/Time-based Structuring Language (HyTime). </cite> [Geneva]: | 1358 | the values "<code>ISO-8859-1</code>", "<code>ISO-8859-2</code>", |
1359 | International Organization for Standardization, 1992. <em>Extended Facilities | 1359 | ... "<code>ISO-8859-</code><var>n</var>" (where <var>n</var> |
1360 | Annexe.</em> [Geneva]: International Organization for Standardization, 1996. </dd><dt class="label"><a name="websgml" id="websgml" />WEBSGML</dt><dd>ISO | 1360 | is the part number) <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> be used for the parts of ISO 8859, and |
1361 | (International Organization for Standardization). <a href="http://www.sgmlsource.com/8879/n0029.htm"><cite>ISO 8879:1986 | 1361 | the values "<code>ISO-2022-JP</code>", "<code>Shift_JIS</code>", |
1362 | TC2. Information technology — Document Description and Processing Languages</cite></a>. | 1362 | and "<code>EUC-JP</code>" |
1363 | [Geneva]: International Organization for Standardization, 1998. (See http://www.sgmlsource.com/8879/n0029.htm.)</dd><dt class="label"><a name="xml-names" id="xml-names" />XML Names</dt><dd>Tim Bray, | 1363 | <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> be used for the various encoded |
1364 | Dave Hollander, and Andrew Layman, editors. <a href="http://www.w3.org/TR/REC-xml-names/"><cite>Namespaces in XML</cite></a>. | 1364 | forms of JIS X-0208-1997. It |
1365 | Textuality, Hewlett-Packard, and Microsoft. World Wide Web Consortium, 1999. (See http://www.w3.org/TR/REC-xml-names/.)</dd></dl></div></div><div class="div1"> <h2><a name="sec-CharNorm" id="sec-CharNorm" />B Definitions for Character Normalization</h2><p>This appendix contains the necessary definitions for character normalization. | 1365 | is <em class="rfc2119" title="Keyword in RFC 2119 context">RECOMMENDED</em> that character encodings registered (as <em>charset</em>s) |
1366 | For additional background information and examples, see <a href="#Charmod">[Charmod]</a>.</p><p> | 1366 | with the Internet Assigned Numbers Authority <a href="#IANA">[IANA-CHARSETS]</a>, |
1367 | [<a name="dt-Uni-encform" id="dt-Uni-encform" title="Unicode encoding form">Definition</a>: Text is said to be | 1367 | other than those just listed, be referred to using their registered names; |
1368 | in a <b>Unicode encoding form</b> if it is encoded in | 1368 | other encodings <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> use names starting with an "x-" prefix. |
1369 | UTF-8, UTF-16 or UTF-32.]</p><p> | 1369 | XML processors <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> match character encoding names in a case-insensitive |
1370 | [<a name="dt-legacyenc" id="dt-legacyenc" title="legacy encoding">Definition</a>: <b>Legacy encoding</b> | 1370 | way and <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> either interpret an IANA-registered name as the encoding registered |
1371 | is taken to mean any character encoding not based on Unicode.]</p><p> | 1371 | at IANA for that name or treat it as unknown (processors are, of course, not |
1372 | [<a name="dt-normtransc" id="dt-normtransc" title="normalizing transcoder">Definition</a>: A | 1372 | required to support all IANA-registered encodings).</p><p>In the absence of information provided by an external transport protocol |
1373 | <b>normalizing transcoder</b> is a transcoder that converts from a | 1373 | (e.g. HTTP or MIME), it is a <a title="Fatal Error" href="#dt-fatal">fatal error</a> for |
1374 | <a title="legacy encoding" href="#dt-legacyenc">legacy encoding</a> to a | 1374 | an entity including an encoding declaration to be presented to the XML processor |
1375 | <a title="Unicode encoding form" href="#dt-Uni-encform">Unicode encoding form</a> and | 1375 | in an encoding other than that named in the declaration, or for an entity which |
1376 | ensures that the result is in Unicode Normalization Form C | 1376 | begins with neither a Byte Order Mark |
1377 | (see UAX #15 <a href="#Unicode">[Unicode]</a>).]</p><p>[<a name="dt-charesc" id="dt-charesc" title="character escape">Definition</a>: A <b>character escape</b> | 1377 | nor an encoding declaration to use an encoding other than UTF-8. Note that |
1378 | is a syntactic device defined in a markup or programming language that allows | 1378 | since ASCII is a subset of UTF-8, ordinary ASCII entities do not strictly |
1379 | one or more of:]</p><ol type="1"><li><p>expressing syntax-significant characters while disregarding | 1379 | need an encoding declaration.</p><p>It is a <a title="Fatal Error" href="#dt-fatal">fatal error</a> for a <a href="#NT-TextDecl">TextDecl</a> to occur other |
1380 | their significance in the syntax of the language, or</p></li><li><p>expressing characters not representable in the character encoding | 1380 | than at the beginning of an external entity.</p><p>It is a <a title="Fatal Error" href="#dt-fatal">fatal error</a> when an XML processor |
1381 | chosen for an instance of the language, or</p></li><li><p>expressing characters in general, without use of the corresponding | 1381 | encounters an entity with an encoding that it is unable to process. It |
1382 | character codes.</p></li></ol><p> | 1382 | is a <a title="Fatal Error" href="#dt-fatal">fatal error</a> if an XML entity is determined (via default, encoding declaration, |
1383 | [<a name="dt-certified" id="dt-certified" title="certified">Definition</a>: <b>Certified</b> text | 1383 | or higher-level protocol) to be in a certain encoding but contains byte |
1384 | is text which satisfies at least one of the following conditions:]</p><ol type="1"><li><p>it has been confirmed through inspection that the text | 1384 | sequences that are not legal in that encoding. Specifically, it is a |
1385 | is in normalized form</p></li><li><p>the source text-processing component is identified | 1385 | fatal error if an entity encoded in UTF-8 contains any irregular code unit sequences, |
1386 | and is known to produce only normalized text.</p></li></ol><p> | 1386 | as defined in Unicode <a href="#Unicode">[Unicode]</a>. Unless an encoding |
1387 | [<a name="dt-uninorm" id="dt-uninorm" title="Unicode-normalized">Definition</a>: Text is, for the purposes of | 1387 | is determined by a higher-level protocol, it is also a <a title="Fatal Error" href="#dt-fatal">fatal error</a> if an XML entity |
1388 | this specification, <b>Unicode-normalized</b> if it is in a | 1388 | contains no encoding declaration and its content is not legal UTF-8 or UTF-16.</p><p>Examples of text declarations containing encoding declarations:</p><div class="exampleInner"><pre><?xml encoding='UTF-8'?> |
1389 | <a title="Unicode encoding form" href="#dt-Uni-encform">Unicode encoding form</a> and is in | 1389 | <?xml encoding='EUC-JP'?></pre></div></div><div class="div3"> <h4><a name="sec-version-info" id="sec-version-info"/>4.3.4 Version Information in Entities</h4><p>Each entity, including the <a title="Document Entity" href="#dt-docent">document entity</a>, |
1390 | Unicode Normalization Form C, according to a version of Unicode Standard Annex #15: | 1390 | can be separately |
1391 | Unicode Normalization Forms <a href="#Unicode">[Unicode]</a> at least as recent as the | 1391 | declared as XML 1.0 or XML 1.1. The version declaration appearing |
1392 | oldest version of the Unicode Standard that contains all the characters | 1392 | in the document entity determines the version of the document as a |
1393 | actually present in the text, but no earlier | 1393 | whole. An XML 1.1 document may invoke XML 1.0 external entities, so |
1394 | than version 3.2.]</p><p> | 1394 | that otherwise duplicated versions of external entities, |
1395 | [<a name="dt-inclnorm" id="dt-inclnorm" title="include-normalized">Definition</a>: Text is | 1395 | particularly DTD external subsets, need not be maintained. However, |
1396 | <b>include-normalized</b> if:]</p><ol type="1"><li><p>the text is <a title="Unicode-normalized" href="#dt-uninorm">Unicode-normalized</a> | 1396 | in such a case the rules of XML 1.1 are applied to the entire |
1397 | and does not contain any <a title="character escape" href="#dt-charesc">character escapes</a> | 1397 | document.</p><p> If an entity (including the document entity) is not labeled |
1398 | or <a title="Include" href="#dt-include">includes</a> whose expansion would | 1398 | with a version number, it is treated as if labeled as version |
1399 | cause the text to become no longer <a title="Unicode-normalized" href="#dt-uninorm">Unicode-normalized</a>; | 1399 | 1.0.</p></div></div><div class="div2"> <h3><a name="entproc" id="entproc"/>4.4 XML Processor Treatment of Entities and References</h3><p>The table below summarizes the contexts in which character references, |
1400 | or</p></li><li><p>the text is in a <a title="legacy encoding" href="#dt-legacyenc">legacy encoding</a> and, if it were transcoded | 1400 | entity references, and invocations of unparsed entities might appear and the |
1401 | to a <a title="Unicode encoding form" href="#dt-Uni-encform">Unicode encoding form</a> by a | 1401 | <em class="rfc2119" title="Keyword in RFC 2119 context">REQUIRED</em> behavior of an <a title="XML Processor" href="#dt-xml-proc">XML processor</a> |
1402 | <a title="normalizing transcoder" href="#dt-normtransc">normalizing transcoder</a>, the resulting | 1402 | in each case. The labels in the leftmost column describe the recognition context: </p><dl><dt class="label">Reference in Content</dt><dd><p>as a reference anywhere after the <a title="Start-Tag" href="#dt-stag">start-tag</a> |
1403 | text would satisfy clause 1 above.</p></li></ol><p> | 1403 | and before the <a title="End Tag" href="#dt-etag">end-tag</a> of an element; corresponds |
1404 | [<a name="dt-compchar" id="dt-compchar" title="composing character">Definition</a>: A <b>composing character</b> | 1404 | to the nonterminal <a href="#NT-content">content</a>.</p></dd><dt class="label">Reference in Attribute Value</dt><dd><p>as a reference within either the value of an attribute in a <a title="Start-Tag" href="#dt-stag">start-tag</a>, |
1405 | is a character that is one or both of the following:]</p><ol type="1"><li><p>the second character in the canonical decomposition mapping of | 1405 | or a default value in an <a title="Attribute-List Declaration" href="#dt-attdecl">attribute declaration</a>; |
1406 | some primary composite (as defined in D3 of UAX #15 <a href="#Unicode">[Unicode]</a>), or</p></li><li><p>of non-zero canonical combining class (as defined in Unicode | 1406 | corresponds to the nonterminal <a href="#NT-AttValue">AttValue</a>.</p></dd><dt class="label">Occurs as Attribute Value</dt><dd><p>as a <a href="#NT-Name">Name</a>, not a reference, appearing either as |
1407 | <a href="#Unicode">[Unicode]</a>).</p></li></ol><p> | 1407 | the value of an attribute which has been declared as type <b>ENTITY</b>, |
1408 | [<a name="dt-fullnorm" id="dt-fullnorm" title="fully normalized">Definition</a>: Text is | 1408 | or as one of the space-separated tokens in the value of an attribute which |
1409 | <b>fully-normalized</b> if:]</p><ol type="1"><li><p>the text is in a <a title="Unicode encoding form" href="#dt-Uni-encform">Unicode encoding | 1409 | has been declared as type <b>ENTITIES</b>.</p></dd><dt class="label">Reference in Entity Value</dt><dd><p>as a reference within a parameter or internal entity's <a title="Literal Entity Value" href="#dt-litentval">literal |
1410 | form</a>, is <a title="include-normalized" href="#dt-inclnorm">include-normalized</a> and | 1410 | entity value</a> in the entity's declaration; corresponds to the nonterminal <a href="#NT-EntityValue">EntityValue</a>.</p></dd><dt class="label">Reference in DTD</dt><dd><p>as a reference within either the internal or external subsets of the <a title="Document Type Declaration" href="#dt-doctype">DTD</a>, but outside of an <a href="#NT-EntityValue">EntityValue</a>, <a href="#NT-AttValue">AttValue</a>, <a href="#NT-PI">PI</a>, <a href="#NT-Comment">Comment</a>, <a href="#NT-SystemLiteral">SystemLiteral</a>, <a href="#NT-PubidLiteral">PubidLiteral</a>, |
1411 | none of the <a title="" href="#dt-relconst"><span>relevant</span> | 1411 | or the contents of an ignored conditional section (see <a href="#sec-condition-sect"><b>3.4 Conditional Sections</b></a>).</p></dd></dl><p> |
1412 | constructs</a> comprising the text begin with a | 1412 | </p><table border="1" frame="border" cellpadding="7" summary="Entity type/reference matrix"><tbody align="center"><tr><td rowspan="2"></td><td colspan="4" align="center" valign="bottom">Entity |
1413 | <a title="composing character" href="#dt-compchar">composing character</a> or a | 1413 | Type</td><td rowspan="2" align="center">Character</td></tr><tr align="center" valign="bottom"><td>Parameter</td><td>Internal General</td><td>External Parsed |
1414 | character escape representing a | 1414 | General</td><td>Unparsed</td></tr><tr align="center" valign="middle"><td align="right">Reference |
1415 | <a title="composing character" href="#dt-compchar">composing character</a>; or</p></li><li><p>the text is in a <a title="legacy encoding" href="#dt-legacyenc">legacy encoding</a> and, | 1415 | in Content</td><td> |
1416 | if it were transcoded to a <a title="Unicode encoding form" href="#dt-Uni-encform">Unicode encoding form</a> | 1416 | <a href="#not-recognized"><cite>Not recognized</cite></a> |
1417 | by a <a title="normalizing transcoder" href="#dt-normtransc">normalizing transcoder</a>, the resulting text | 1417 | </td><td> |
1418 | would satisfy clause 1 above.</p></li></ol></div><div class="div1"> <h2><a name="sec-entexpand" id="sec-entexpand" />C Expansion of Entity and Character References (Non-Normative)</h2><p>This appendix contains some examples illustrating the sequence of entity- | 1418 | <a href="#included"><cite>Included</cite></a> |
1419 | and character-reference recognition and expansion, as specified in <a href="#entproc"><b>4.4 XML Processor Treatment of Entities and References</b></a>.</p><p>If the DTD contains the declaration</p><div class="exampleInner"><pre><!ENTITY example "<p>An ampersand (&#38;#38;) may be escaped | 1419 | </td><td> |
1420 | numerically (&#38;#38;#38;) or with a general entity | 1420 | <a href="#include-if-valid"><cite>Included |
1421 | (&amp;amp;).</p>" ></pre></div><p>then the XML processor will recognize the character references when it | 1421 | if validating</cite></a> |
1422 | parses the entity declaration, and resolve them before storing the following | 1422 | </td><td> |
1423 | string as the value of the entity "<code>example</code>":</p><div class="exampleInner"><pre><p>An ampersand (&#38;) may be escaped | 1423 | <a href="#forbidden"><cite>Forbidden</cite></a> |
1424 | numerically (&#38;#38;) or with a general entity | 1424 | </td><td> |
1425 | (&amp;amp;).</p></pre></div><p>A reference in the document to "<code>&example;</code>" | 1425 | <a href="#included"><cite>Included</cite></a> |
1426 | will cause the text to be reparsed, at which time the start- and end-tags | 1426 | </td></tr><tr align="center" valign="middle"><td align="right">Reference in Attribute Value</td><td> |
1427 | of the <code>p</code> element will be recognized and the three references will | 1427 | <a href="#not-recognized"><cite>Not recognized</cite></a> |
1428 | be recognized and expanded, resulting in a <code>p</code> element with the following | 1428 | </td><td> |
1429 | content (all data, no delimiters or markup):</p><div class="exampleInner"><pre>An ampersand (&) may be escaped | 1429 | <a href="#inliteral"><cite>Included |
1430 | numerically (&#38;) or with a general entity | 1430 | in literal</cite></a> |
1431 | (&amp;).</pre></div><p>A more complex example will illustrate the rules and their effects fully. | 1431 | </td><td> |
1432 | In the following example, the line numbers are solely for reference.</p><div class="exampleInner"><pre>1 <?xml version='1.0'?> | 1432 | <a href="#forbidden"><cite>Forbidden</cite></a> |
1433 | 2 <!DOCTYPE test [ | 1433 | </td><td> |
1434 | 3 <!ELEMENT test (#PCDATA) > | 1434 | <a href="#forbidden"><cite>Forbidden</cite></a> |
1435 | 4 <!ENTITY % xx '&#37;zz;'> | 1435 | </td><td> |
1436 | 5 <!ENTITY % zz '&#60;!ENTITY tricky "error-prone" >' > | 1436 | <a href="#included"><cite>Included</cite></a> |
1437 | 6 %xx; | 1437 | </td></tr><tr align="center" valign="middle"><td align="right">Occurs as Attribute |
1438 | 7 ]> | 1438 | Value</td><td> |
1439 | 8 <test>This sample shows a &tricky; method.</test></pre></div><p>This produces the following:</p><ul><li><p>in line 4, the reference to character 37 is expanded immediately, | 1439 | <a href="#not-recognized"><cite>Not recognized</cite></a> |
1440 | and the parameter entity "<code>xx</code>" is stored in the symbol | 1440 | </td><td> |
1441 | table with the value "<code>%zz;</code>". Since the replacement | 1441 | <a href="#forbidden"><cite>Forbidden</cite></a> |
1442 | text is not rescanned, the reference to parameter entity "<code>zz</code>" | 1442 | </td><td> |
1443 | is not recognized. (And it would be an error if it were, since "<code>zz</code>" | 1443 | <a href="#forbidden"><cite>Forbidden</cite></a> |
1444 | is not yet declared.)</p></li><li><p>in line 5, the character reference "<code>&#60;</code>" | 1444 | </td><td> |
1445 | is expanded immediately and the parameter entity "<code>zz</code>" | 1445 | <a href="#notify"><cite>Notify</cite></a> |
1446 | is stored with the replacement text "<code><!ENTITY tricky "error-prone" | 1446 | </td><td> |
1447 | ></code>", which is a well-formed entity declaration.</p></li><li><p>in line 6, the reference to "<code>xx</code>" is recognized, | 1447 | <a href="#not-recognized"><cite>Not recognized</cite></a> |
1448 | and the replacement text of "<code>xx</code>" (namely "<code>%zz;</code>") | 1448 | </td></tr><tr align="center" valign="middle"><td align="right">Reference in EntityValue</td><td> |
1449 | is parsed. The reference to "<code>zz</code>" is recognized in | 1449 | <a href="#inliteral"><cite>Included in literal</cite></a> |
1450 | its turn, and its replacement text ("<code><!ENTITY tricky "error-prone" | 1450 | </td><td> |
1451 | ></code>") is parsed. The general entity "<code>tricky</code>" | 1451 | <a href="#bypass"><cite>Bypassed</cite></a> |
1452 | has now been declared, with the replacement text "<code>error-prone</code>".</p></li><li><p>in line 8, the reference to the general entity "<code>tricky</code>" | 1452 | </td><td> |
1453 | is recognized, and it is expanded, so the full content of the <code>test</code> | 1453 | <a href="#bypass"><cite>Bypassed</cite></a> |
1454 | element is the self-describing (and ungrammatical) string <em>This sample | 1454 | </td><td> |
1455 | shows a error-prone method.</em></p></li></ul></div><div class="div1"> <h2><a name="determinism" id="determinism" />D Deterministic Content Models (Non-Normative)</h2><p>As | 1455 | <a href="#error"><cite>Error</cite></a> |
1456 | noted in <a href="#sec-element-content"><b>3.2.1 Element Content</b></a>, it is required that content | 1456 | </td><td> |
1457 | models in element type declarations be deterministic. This requirement is <a title="For Compatibility" href="#dt-compat">for compatibility</a> with SGML (which calls deterministic | 1457 | <a href="#included"><cite>Included</cite></a> |
1458 | content models "unambiguous"); XML processors built | 1458 | </td></tr><tr align="center" valign="middle"><td align="right">Reference in DTD</td><td> |
1459 | using SGML systems may flag non-deterministic content models as errors.</p><p>For example, the content model <code>((b, c) | (b, d))</code> is non-deterministic, | 1459 | <a href="#as-PE"><cite>Included as PE</cite></a> |
1460 | because given an initial <code>b</code> the XML processor | 1460 | </td><td> |
1461 | cannot know which <code>b</code> in the model is being matched without looking | 1461 | <a href="#forbidden"><cite>Forbidden</cite></a> |
1462 | ahead to see which element follows the <code>b</code>. In this case, the two references | 1462 | </td><td> |
1463 | to <code>b</code> can be collapsed into a single reference, making the model read <code>(b, | 1463 | <a href="#forbidden"><cite>Forbidden</cite></a> |
1464 | (c | d))</code>. An initial <code>b</code> now clearly matches only a single name | 1464 | </td><td> |
1465 | in the content model. The processor doesn't need to look ahead to see what follows; either <code>c</code> or <code>d</code> | 1465 | <a href="#forbidden"><cite>Forbidden</cite></a> |
1466 | would be accepted.</p><p>More formally: a finite state automaton may be constructed from the content | 1466 | </td><td> |
1467 | model using the standard algorithms, e.g. algorithm 3.5 in section 3.9 of | 1467 | <a href="#forbidden"><cite>Forbidden</cite></a> |
1468 | Aho, Sethi, and Ullman <a href="#Aho">[Aho/Ullman]</a>. In many such algorithms, a follow | 1468 | </td></tr></tbody></table><div class="div3"> <h4><a name="not-recognized" id="not-recognized"/>4.4.1 Not Recognized</h4><p>Outside the DTD, the <code>%</code> character has no special significance; |
1469 | set is constructed for each position in the regular expression (i.e., each | 1469 | thus, what would be parameter entity references in the DTD are not recognized |
1470 | leaf node in the syntax tree for the regular expression); if any position | 1470 | as markup in <a href="#NT-content">content</a>. Similarly, the names of unparsed |
1471 | has a follow set in which more than one following position is labeled with | 1471 | entities are not recognized except when they appear in the value of an appropriately |
1472 | the same element type name, then the content model is in error and may be | 1472 | declared attribute.</p></div><div class="div3"> <h4><a name="included" id="included"/>4.4.2 Included</h4><p> |
1473 | reported as an error.</p><p>Algorithms exist which allow many but not all non-deterministic content | 1473 | [<a name="dt-include" id="dt-include" title="Include">Definition</a>: An entity is <b>included</b> |
1474 | models to be reduced automatically to equivalent deterministic models; see | 1474 | when its <a title="Replacement Text" href="#dt-repltext">replacement text</a> is retrieved |
1475 | Brüggemann-Klein 1991 <a href="#ABK">[Brüggemann-Klein]</a>.</p></div><div class="div1"> <h2><a name="sec-guessing" id="sec-guessing" />E Autodetection of Character Encodings (Non-Normative)</h2><p>The XML encoding declaration functions as an internal label on each entity, | 1475 | and processed, in place of the reference itself, as though it were part of |
1476 | indicating which character encoding is in use. Before an XML processor can | 1476 | the document at the location the reference was recognized.] The replacement |
1477 | read the internal label, however, it apparently has to know what character | 1477 | text <span>may</span> contain both <a title="Character Data" href="#dt-chardata">character data</a> |
1478 | encoding is in use — which is what the internal label is trying to indicate. | 1478 | and (except for parameter entities) <a title="Markup" href="#dt-markup">markup</a>, |
1479 | In the general case, this is a hopeless situation. It is not entirely hopeless | 1479 | which <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be recognized in the usual way. (The string "<code>AT&amp;T;</code>" |
1480 | in XML, however, because XML limits the general case in two ways: each implementation | 1480 | expands to "<code>AT&T;</code>" and the remaining ampersand |
1481 | is assumed to support only a finite set of character encodings, and the XML | 1481 | is not recognized as an entity-reference delimiter.) A character reference |
1482 | encoding declaration is restricted in position and content in order to make | 1482 | is <b>included</b> when the indicated character is processed in place |
1483 | it feasible to autodetect the character encoding in use in each entity in | 1483 | of the reference itself. </p></div><div class="div3"> <h4><a name="include-if-valid" id="include-if-valid"/>4.4.3 Included If Validating</h4><p>When an XML processor recognizes a reference to a parsed entity, in order |
1484 | normal cases. Also, in many cases other sources of information are available | 1484 | to <a title="Validity" href="#dt-valid">validate</a> the document, the processor |
1485 | in addition to the XML data stream itself. Two cases may be distinguished, | 1485 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> |
1486 | depending on whether the XML entity is presented to the processor without, | 1486 | <a title="Include" href="#dt-include">include</a> its replacement text. If |
1487 | or with, any accompanying (external) information. We consider the first case | 1487 | the entity is external, and the processor is not attempting to validate the |
1488 | first.</p><div class="div2"> <h3><a name="sec-guessing-no-ext-info" id="sec-guessing-no-ext-info" />E.1 Detection Without External Encoding Information</h3><p>Because each XML entity not accompanied by external | 1488 | XML document, the processor <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em>, but need |
1489 | encoding information and not in UTF-8 or UTF-16 encoding must | 1489 | not, include the entity's replacement text. If a non-validating processor |
1490 | begin with an XML encoding declaration, in which the first characters must | 1490 | does not include the replacement text, it <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> inform the application that |
1491 | be '<code><?xml</code>', any conforming processor can detect, after two | 1491 | it recognized, but did not read, the entity.</p><p>This rule is based on the recognition that the automatic inclusion provided |
1492 | to four octets of input, which of the following cases apply. In reading this | 1492 | by the SGML and XML entity mechanism, primarily designed to support modularity |
1493 | list, it may help to know that in UCS-4, '<' is "<code>#x0000003C</code>" | 1493 | in authoring, is not necessarily appropriate for other applications, in particular |
1494 | and '?' is "<code>#x0000003F</code>", and the Byte Order Mark | 1494 | document browsing. Browsers, for example, when encountering an external parsed |
1495 | required of UTF-16 data streams is "<code>#xFEFF</code>". The notation | 1495 | entity reference, might choose to provide a visual indication of the entity's |
1496 | <var>##</var> is used to denote any byte value except that two consecutive | 1496 | presence and retrieve it for display only on demand.</p></div><div class="div3"> <h4><a name="forbidden" id="forbidden"/>4.4.4 Forbidden</h4><p>The following are forbidden, and constitute <a title="Fatal Error" href="#dt-fatal">fatal |
1497 | <var>##</var>s cannot be both 00.</p><p>With a Byte Order Mark:</p><table border="1" frame="border" summary="Encoding detection summary"><tbody><tr><td rowspan="1" colspan="1"><code>00 00 FE | 1497 | errors</a>:</p><ul><li><p>the appearance of a reference to an <a title="Unparsed Entity" href="#dt-unparsed">unparsed |
1498 | FF</code></td><td rowspan="1" colspan="1">UCS-4, big-endian machine (1234 order)</td></tr><tr><td rowspan="1" colspan="1"><code>FF | 1498 | entity</a>, except in the |
1499 | FE 00 00</code></td><td rowspan="1" colspan="1">UCS-4, little-endian machine (4321 order)</td></tr><tr><td rowspan="1" colspan="1"><code>00 00 FF FE</code></td><td rowspan="1" colspan="1">UCS-4, unusual octet order (2143)</td></tr><tr><td rowspan="1" colspan="1"><code>FE FF 00 00</code></td><td rowspan="1" colspan="1">UCS-4, unusual octet order (3412)</td></tr><tr><td rowspan="1" colspan="1"><code>FE FF ## ##</code></td><td rowspan="1" colspan="1">UTF-16, big-endian</td></tr><tr><td rowspan="1" colspan="1"><code>FF FE ## ##</code></td><td rowspan="1" colspan="1">UTF-16, little-endian</td></tr><tr><td rowspan="1" colspan="1"><code>EF BB BF</code></td><td rowspan="1" colspan="1">UTF-8</td></tr></tbody></table><p>Without a Byte Order Mark:</p><table border="1" frame="border" summary="Encoding detection summary"><tbody><tr><td rowspan="1" colspan="1"><code>00 00 00 3C</code></td><td rowspan="4" colspan="1">UCS-4 or other encoding with a 32-bit code unit and ASCII | 1499 | <a href="#NT-EntityValue">EntityValue</a> in an entity declaration.</p></li><li><p>the appearance of any character or general-entity reference in the |
1500 | characters encoded as ASCII values, in respectively big-endian (1234), little-endian | 1500 | DTD except within an <a href="#NT-EntityValue">EntityValue</a> or <a href="#NT-AttValue">AttValue</a>.</p></li><li><p>a reference to an external entity in an attribute value.</p></li></ul></div><div class="div3"> <h4><a name="inliteral" id="inliteral"/>4.4.5 Included in Literal</h4><p>When an <a title="Entity Reference" href="#dt-entref">entity reference</a> appears in |
1501 | (4321) and two unusual byte orders (2143 and 3412). The encoding declaration | 1501 | an attribute value, or a parameter entity reference appears in a literal entity |
1502 | must be read to determine which of UCS-4 or other supported 32-bit encodings | 1502 | value, its <a title="Replacement Text" href="#dt-repltext">replacement text</a> |
1503 | applies.</td></tr><tr><td rowspan="1" colspan="1"><code>3C 00 00 00</code></td></tr><tr><td rowspan="1" colspan="1"><code>00 00 3C 00</code></td></tr><tr><td rowspan="1" colspan="1"><code>00 3C 00 00</code></td></tr><tr><td rowspan="1" colspan="1"><code>00 3C 00 3F</code></td><td rowspan="1" colspan="1">UTF-16BE or big-endian ISO-10646-UCS-2 | 1503 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be processed |
1504 | or other encoding with a 16-bit code unit in big-endian order and ASCII characters | 1504 | in place of the reference itself as though it were part of the document at |
1505 | encoded as ASCII values (the encoding declaration must be read to determine | 1505 | the location the reference was recognized, except that a single or double |
1506 | which)</td></tr><tr><td rowspan="1" colspan="1"><code>3C 00 3F 00</code></td><td rowspan="1" colspan="1">UTF-16LE or little-endian | 1506 | quote character in the replacement text <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> always be treated as a normal data |
1507 | ISO-10646-UCS-2 or other encoding with a 16-bit code unit in little-endian | 1507 | character and <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> terminate the literal. For example, this is well-formed:</p><div class="exampleInner"><pre><!ENTITY % YN '"Yes"' > |
1508 | order and ASCII characters encoded as ASCII values (the encoding declaration | 1508 | <!ENTITY WhatHeSaid "He said %YN;" ></pre></div><p>while this is not:</p><div class="exampleInner"><pre><!ENTITY EndAttr "27'" > |
1509 | must be read to determine which)</td></tr><tr><td rowspan="1" colspan="1"><code>3C 3F 78 6D</code></td><td rowspan="1" colspan="1">UTF-8, ISO 646, ASCII, some part of ISO 8859, Shift-JIS, EUC, or any other | 1509 | <element attribute='a-&EndAttr;></pre></div></div><div class="div3"> <h4><a name="notify" id="notify"/>4.4.6 Notify</h4><p>When the name of an <a title="Unparsed Entity" href="#dt-unparsed">unparsed entity</a> |
1510 | 7-bit, 8-bit, or mixed-width encoding which ensures that the characters of | 1510 | appears as a token in the value of an attribute of declared type <b>ENTITY</b> |
1511 | ASCII have their normal positions, width, and values; the actual encoding | 1511 | or <b>ENTITIES</b>, a validating processor <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> inform the application of |
1512 | declaration must be read to detect which of these applies, but since all of | 1512 | the <a title="System Identifier" href="#dt-sysid">system</a> and <a title="Public identifier" href="#dt-pubid">public</a> |
1513 | these encodings use the same bit patterns for the relevant ASCII characters, | 1513 | (if any) identifiers for both the entity and its associated <a title="Notation" href="#dt-notation">notation</a>.</p></div><div class="div3"> <h4><a name="bypass" id="bypass"/>4.4.7 Bypassed</h4><p>When a general entity reference appears in the <a href="#NT-EntityValue">EntityValue</a> |
1514 | the encoding declaration itself may be read reliably</td></tr><tr><td rowspan="1" colspan="1"><code>4C | 1514 | in an entity declaration, it <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be bypassed and left as is.</p></div><div class="div3"> <h4><a name="as-PE" id="as-PE"/>4.4.8 Included as PE</h4><p>Just as with external parsed entities, parameter entities need only be <a href="#include-if-valid"><cite>included if validating</cite></a>. When a parameter-entity |
1515 | 6F A7 94</code></td><td rowspan="1" colspan="1">EBCDIC (in some flavor; the full encoding declaration | 1515 | reference is recognized in the DTD and included, its <a title="Replacement Text" href="#dt-repltext">replacement |
1516 | must be read to tell which code page is in use)</td></tr><tr><td rowspan="1" colspan="1">Other</td><td rowspan="1" colspan="1">UTF-8 without an encoding declaration, or else the data stream is mislabeled | 1516 | text</a> |
1517 | (lacking a required encoding declaration), corrupt, fragmentary, or enclosed | 1517 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be enlarged by the attachment of one leading and one following |
1518 | in a wrapper of some kind</td></tr></tbody></table><div class="note"><p class="prefix"><b>Note:</b></p><p>In cases above which do not require reading the encoding declaration to | 1518 | space (#x20) character; the intent is to constrain the replacement text of |
1519 | determine the encoding, section 4.3.3 still requires that the encoding declaration, | 1519 | parameter entities to contain an integral number of grammatical tokens in |
1520 | if present, be read and that the encoding name be checked to match the actual | 1520 | the DTD. This |
1521 | encoding of the entity. Also, it is possible that new character encodings | 1521 | behavior <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> apply to parameter entity references within entity values; |
1522 | will be invented that will make it necessary to use the encoding declaration | 1522 | these are described in <a href="#inliteral"><b>4.4.5 Included in Literal</b></a>.</p></div><div class="div3"> <h4><a name="error" id="error"/>4.4.9 Error</h4><p>It is an <a title="Error" href="#dt-error">error</a> for a reference to |
1523 | to determine the encoding, in cases where this is not required at present.</p></div><p>This level of autodetection is enough to read the XML encoding declaration | 1523 | an unparsed entity to appear in the <a href="#NT-EntityValue">EntityValue</a> in an |
1524 | and parse the character-encoding identifier, which is still necessary to distinguish | 1524 | entity declaration.</p></div></div><div class="div2"> <h3><a name="intern-replacement" id="intern-replacement"/>4.5 Construction of Entity Replacement Text</h3><p>In discussing the treatment of entities, it is useful to distinguish |
1525 | the individual members of each family of encodings (e.g. to tell UTF-8 from | 1525 | two forms of the entity's value. |
1526 | 8859, and the parts of 8859 from each other, or to distinguish the specific | 1526 | [<a name="dt-litentval" id="dt-litentval" title="Literal Entity Value">Definition</a>: For an |
1527 | EBCDIC code page in use, and so on).</p><p>Because the contents of the encoding declaration are restricted to characters | 1527 | internal entity, the <b>literal |
1528 | from the ASCII repertoire (however encoded), | 1528 | entity value</b> is the quoted string actually present in the entity declaration, |
1529 | a processor can reliably read the entire encoding declaration as soon as it | 1529 | corresponding to the non-terminal <a href="#NT-EntityValue">EntityValue</a>.] |
1530 | has detected which family of encodings is in use. Since in practice, all widely | 1530 | [<a name="dt-extlitentval" id="dt-extlitentval" title="Literal Entity Value">Definition</a>: For an external entity, the <b>literal |
1531 | used character encodings fall into one of the categories above, the XML encoding | 1531 | entity value</b> is the exact text contained in the entity.] |
1532 | declaration allows reasonably reliable in-band labeling of character encodings, | 1532 | [<a name="dt-repltext" id="dt-repltext" title="Replacement Text">Definition</a>: For an |
1533 | even when external sources of information at the operating-system or transport-protocol | 1533 | internal entity, the <b>replacement text</b> |
1534 | level are unreliable. Character encodings such as UTF-7 | 1534 | is the content of the entity, after replacement of character references and |
1535 | that make overloaded usage of ASCII-valued bytes may fail to be reliably detected.</p><p>Once the processor has detected the character encoding in use, it can act | 1535 | parameter-entity references.] |
1536 | appropriately, whether by invoking a separate input routine for each case, | 1536 | [<a name="dt-extrepltext" id="dt-extrepltext" title="Replacement Text">Definition</a>: For |
1537 | or by calling the proper conversion function on each character of input.</p><p>Like any self-labeling system, the XML encoding declaration will not work | 1537 | an external entity, the <b>replacement text</b> is the content of the entity, |
1538 | if any software changes the entity's character set or encoding without updating | 1538 | after stripping the text declaration (leaving any surrounding white space) if there |
1539 | the encoding declaration. Implementors of character-encoding routines should | 1539 | is one but without any replacement of character references or parameter-entity |
1540 | be careful to ensure the accuracy of the internal and external information | 1540 | references.] |
1541 | used to label the entity.</p></div><div class="div2"> <h3><a name="sec-guessing-with-ext-info" id="sec-guessing-with-ext-info" />E.2 Priorities in the Presence of External Encoding Information</h3><p>The second possible case occurs when the XML entity is accompanied by encoding | 1541 | </p><p>The literal entity value as given in an internal entity declaration (<a href="#NT-EntityValue">EntityValue</a>) <span>may</span> contain character, parameter-entity, |
1542 | information, as in some file systems and some network protocols. When multiple | 1542 | and general-entity references. Such references <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be contained entirely |
1543 | sources of information are available, their relative priority and the preferred | 1543 | within the literal entity value. The actual replacement text that is <a title="Include" href="#dt-include">included</a> (or <a title="" href="#inliteral">included in literal</a>) as described above |
1544 | method of handling conflict should be specified as part of the higher-level | 1544 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> contain the <em>replacement |
1545 | protocol used to deliver XML. In particular, please refer | 1545 | text</em> of any parameter entities referred to, and <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> contain the character |
1546 | to <a href="#rfc2376">[IETF RFC 3023]</a> or its successor, which defines the <code>text/xml</code> | 1546 | referred to, in place of any character references in the literal entity value; |
1547 | and <code>application/xml</code> MIME types and provides some useful guidance. | 1547 | however, general-entity references <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be left as-is, unexpanded. For example, |
1548 | In the interests of interoperability, however, the following rule is recommended.</p><ul><li><p>If an XML entity is in a file, the Byte-Order Mark and encoding declaration are used | 1548 | given the following declarations:</p><div class="exampleInner"><pre><!ENTITY % pub "&#xc9;ditions Gallimard" > |
1549 | (if present) to determine the character encoding.</p></li></ul></div></div><div class="div1"> <h2><a name="sec-xml-wg" id="sec-xml-wg" />F W3C XML Working Group (Non-Normative)</h2><p>This specification was prepared and approved for publication by the W3C | 1549 | <!ENTITY rights "All rights reserved" > |
1550 | XML Working Group (WG). WG approval of this specification does not necessarily | 1550 | <!ENTITY book "La Peste: Albert Camus, |
1551 | imply that all WG participants voted for its approval. The current and former members | 1551 | &#xA9; 1947 %pub;. &rights;" ></pre></div><p>then the replacement text for the entity "<code>book</code>" |
1552 | in the XML WG are:</p><ul><li>Jon Bosak, Sun (<i>Chair</i>) </li><li>James Clark (<i>Technical Lead</i>) </li><li>Tim Bray, Textuality and Netscape (<i>XML Co-editor</i>) </li><li>Jean Paoli, Microsoft (<i>XML | 1552 | is:</p><div class="exampleInner"><pre>La Peste: Albert Camus, |
1553 | Co-editor</i>) </li><li>C. M. Sperberg-McQueen, U. of Ill. (<i>XML Co-editor</i>) </li><li>Dan Connolly, W3C (<i>W3C Liaison</i>) </li><li>Paula Angerstein, Texcel</li><li>Steve DeRose, INSO</li><li>Dave Hollander, HP</li><li>Eliot Kimber, ISOGEN</li><li>Eve Maler, ArborText</li><li>Tom Magliery, NCSA</li><li>Murray Maloney, SoftQuad, Grif | 1553 | © 1947 Éditions Gallimard. &rights;</pre></div><p>The general-entity reference "<code>&rights;</code>" would |
1554 | SA, Muzmo and Veo Systems</li><li>MURATA Makoto (FAMILY Given), Fuji | 1554 | be expanded should the reference "<code>&book;</code>" appear |
1555 | Xerox Information Systems</li><li>Joel Nava, Adobe</li><li>Conleth O'Connell, Vignette</li><li>Peter Sharpe, SoftQuad</li><li>John Tigue, DataChannel</li></ul></div><div class="div1"> <h2><a name="sec-core-wg" id="sec-core-wg" />G W3C XML Core <span>Working</span> Group (Non-Normative)</h2><p>The present edition of this specification was prepared by the W3C XML Core | 1555 | in the document's content or an attribute value.</p><p>These simple rules may have complex interactions; for a detailed discussion |
1556 | Working Group (WG). The participants in the WG at the time of publication of this | 1556 | of a difficult example, see <a href="#sec-entexpand"><b>C Expansion of Entity and Character References</b></a>.</p></div><div class="div2"> <h3><a name="sec-predefined-ent" id="sec-predefined-ent"/>4.6 Predefined Entities</h3><p> |
1557 | edition were:</p><ul><li>Leonid Arbouzov, Sun Microsystems</li><li>Mary Brady</li><li>John Cowan (<i>XML 1.1 First Edition Editor</i>) </li><li>John Evdemon, Microsoft</li><li>Andrew Fang, Arbortext</li><li>Paul Grosso, Arbortext (<i>Co-Chair</i>) </li><li>Arnaud Le Hors, IBM</li><li>Dmitry Lenkov, Oracle</li><li>Anjana Manian, Oracle</li><li>Glenn Marcy, IBM</li><li>Jonathan Marsh, Microsoft</li><li>Sandra Martinez, NIST</li><li>Liam Quin, W3C (<i>Staff Contact</i>) </li><li>Lew Shannon</li><li>Richard Tobin, University of Edinburgh</li><li>Daniel Veillard</li><li>Norman Walsh, Sun Microsystems (<i>Co-Chair</i>) </li><li>François Yergeau</li></ul></div><div class="div1"> <h2><a name="prod-notes" id="prod-notes" />H Production Notes (Non-Normative)</h2><p>This edition was encoded in a | 1557 | [<a name="dt-escape" id="dt-escape" title="escape">Definition</a>: Entity and character references <span>may</span> |
1558 | slightly modified version of the | 1558 | both be used to <b>escape</b> the left angle bracket, ampersand, and |
1559 | <a href="http://www.w3.org/2002/xmlspec/dtd/2.5/xmlspec.dtd">XMLspec DTD, 2.5</a>. | 1559 | other delimiters. A set of general entities (<code>amp</code>, |
1560 | The XHTML versions were produced with a combination of the | 1560 | <code>lt</code>, |
1561 | <a href="http://www.w3.org/2002/xmlspec/xhtml/1.9/xmlspec.xsl">xmlspec.xsl</a>, | 1561 | <code>gt</code>, |
1562 | <a href="http://www.w3.org/2002/xmlspec/xhtml/1.9/diffspec.xsl">diffspec.xsl</a>, | 1562 | <code>apos</code>, |
1563 | and <a href="REC-xml-3e.xsl">REC-xml-3e.xsl</a> | 1563 | <code>quot</code>) is specified for |
1564 | XSLT stylesheets.</p></div><div class="div1"> <h2><a name="sec-suggested-names" id="sec-suggested-names" />I Suggestions for XML Names (Non-Normative)</h2><p>The following suggestions define what is believed to be best | 1564 | this purpose. Numeric character references <span>may</span> also be used; they are expanded |
1565 | practice in the construction of XML names used as element names, | 1565 | immediately when recognized and <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be treated as character data, so the |
1566 | attribute names, processing instruction targets, entity names, | 1566 | numeric character references "<code>&#60;</code>" and "<code>&#38;</code>" <span>may</span> be used to escape <code><</code> and <code>&</code> when they occur |
1567 | notation names, and the values of attributes of type ID, and are | 1567 | in character data.] |
1568 | intended as guidance for document authors and schema designers. | 1568 | </p><p>All XML processors <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> recognize these entities whether they are declared |
1569 | All references to Unicode are understood with respect to | 1569 | or not. <a title="For interoperability" href="#dt-interop">For interoperability</a>, valid XML |
1570 | a particular version of the Unicode Standard greater than or equal | 1570 | documents <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> declare these entities, like any others, before using them. If |
1571 | to 3.0; which version should be used is left to the discretion of | 1571 | the entities <code>lt</code> or <code>amp</code> are declared, they <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be |
1572 | the document author or schema designer.</p><p>The first two suggestions are directly derived from the rules | 1572 | declared as internal entities whose replacement text is a character reference |
1573 | given for identifiers in the Unicode Standard, version 3.0, and | 1573 | to the respective |
1574 | exclude all control characters, enclosing nonspacing marks, | 1574 | character (less-than sign or ampersand) being escaped; the double |
1575 | non-decimal numbers, private-use characters, punctuation characters | 1575 | escaping is <em class="rfc2119" title="Keyword in RFC 2119 context">REQUIRED</em> for these entities so that references to them produce |
1576 | (with the noted exceptions), symbol characters, unassigned | 1576 | a well-formed result. If the entities <code>gt</code>, <code>apos</code>, |
1577 | codepoints, and white space characters. The other suggestions | 1577 | or <code>quot</code> are declared, they <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be declared as internal entities |
1578 | are mostly derived from <a href="#XML1.0">[XML-1.0]</a> Appendix B.</p><ol type="1"><li><p>The first character of any name should have a Unicode General | 1578 | whose replacement text is the single character being escaped (or a character |
1579 | Category of Ll, Lu, Lo, Lm, Lt, or Nl, or else be '_' #x5F.</p></li><li><p>Characters other than the first should have a Unicode General | 1579 | reference to that character; the double escaping here is <em class="rfc2119" title="Keyword in RFC 2119 context">OPTIONAL</em> but harmless). |
1580 | Category of Ll, Lu, Lo, Lm, Lt, Mc, Mn, Nl, Nd, Pc, or Cf, or else | 1580 | For example:</p><div class="exampleInner"><pre><!ENTITY lt "&#38;#60;"> |
1581 | be one of the following: '-' #x2D, '.' #x2E, ':' #x3A or | 1581 | <!ENTITY gt "&#62;"> |
1582 | '·' #xB7 (middle dot). Since Cf characters are not | 1582 | <!ENTITY amp "&#38;#38;"> |
1583 | directly visible, they should be employed with caution and only | 1583 | <!ENTITY apos "&#39;"> |
1584 | when necessary, to avoid creating names which are distinct to XML | 1584 | <!ENTITY quot "&#34;"></pre></div></div><div class="div2"> <h3><a name="Notations" id="Notations"/>4.7 Notation Declarations</h3><p> |
1585 | processors but look the same to human beings.</p></li><li><p>Ideographic characters which have a canonical decomposition | 1585 | [<a name="dt-notation" id="dt-notation" title="Notation">Definition</a>: |
1586 | (including those in the ranges [#xF900-#xFAFF] and | 1586 | <b>Notations</b> identify |
1587 | [#x2F800-#x2FFFD], with 12 exceptions) should not be used in names. | 1587 | by name the format of <a title="Unparsed Entity" href="#dt-unparsed">unparsed entities</a>, |
1588 | </p></li><li><p>Characters which have a compatibility decomposition (those with | 1588 | the format of elements which bear a notation attribute, or the application |
1589 | a "compatibility formatting tag" in field 5 of the Unicode | 1589 | to which a <a title="Processing instruction" href="#dt-pi">processing instruction</a> is addressed.] |
1590 | Character Database -- marked by field 5 beginning with a "<") | 1590 | </p><p> |
1591 | should not be used in names. This suggestion does not apply | 1591 | [<a name="dt-notdecl" id="dt-notdecl" title="Notation Declaration">Definition</a>: |
1592 | to #x0E33 THAI CHARACTER SARA AM or #x0EB3 LAO CHARACTER AM, which | 1592 | <b>Notation declarations</b> |
1593 | despite their compatibility decompositions are in regular use in | 1593 | provide a name for the notation, for use in entity and attribute-list declarations |
1594 | those scripts.</p></li><li><p>Combining characters meant for use with symbols only (including | 1594 | and in attribute specifications, and an external identifier for the notation |
1595 | those in the ranges [#x20D0-#x20EF] and [#x1D165-#x1D1AD]) should | 1595 | which may allow an XML processor or its client application to locate a helper |
1596 | not be used in names.</p></li><li><p>The interlinear annotation characters ([#xFFF9-#xFFFB]) should | 1596 | application capable of processing data in the given notation.] |
1597 | not be used in names.</p></li><li><p>Variation selector characters should not be used in names.</p></li><li><p>Names which are nonsensical, unpronounceable, hard to read, or | 1597 | </p> <h5><a name="IDANRMS" id="IDANRMS"/>Notation Declarations</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-NotationDecl" id="NT-NotationDecl"/>[82] </td><td><code>NotationDecl</code></td><td> ::= </td><td><code>'<!NOTATION' <a href="#NT-S">S</a> |
1598 | easily confusable with other names should not be employed.</p></li></ol></div></div></body></html> | 1598 | <a href="#NT-Name">Name</a> |
1599 | <a href="#NT-S">S</a> (<a href="#NT-ExternalID">ExternalID</a> | <a href="#NT-PublicID">PublicID</a>) <a href="#NT-S">S</a>? '>'</code></td><td><a href="#UniqueNotationName">[VC: Unique Notation Name]</a></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-PublicID" id="NT-PublicID"/>[83] </td><td><code>PublicID</code></td><td> ::= </td><td><code>'PUBLIC' <a href="#NT-S">S</a> | ||
1600 | <a href="#NT-PubidLiteral">PubidLiteral</a> | ||
1601 | </code></td></tr></tbody></table><div class="constraint"><p class="prefix"><a name="UniqueNotationName" id="UniqueNotationName"/><b>Validity constraint: Unique Notation Name</b></p><p>A given <a href="#NT-Name">Name</a> | ||
1602 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> be declared in more than one notation declaration.</p></div><p>XML processors <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> provide applications with the name and external identifier(s) | ||
1603 | of any notation declared and referred to in an attribute value, attribute | ||
1604 | definition, or entity declaration. They <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> additionally resolve the external | ||
1605 | identifier into the <a title="System Identifier" href="#dt-sysid">system identifier</a>, file | ||
1606 | name, or other information needed to allow the application to call a processor | ||
1607 | for data in the notation described. (It is not an error, however, for XML | ||
1608 | documents to declare and refer to notations for which notation-specific applications | ||
1609 | are not available on the system where the XML processor or application is | ||
1610 | running.)</p></div><div class="div2"> <h3><a name="sec-doc-entity" id="sec-doc-entity"/>4.8 Document Entity</h3><p> | ||
1611 | [<a name="dt-docent" id="dt-docent" title="Document Entity">Definition</a>: The <b>document entity</b> | ||
1612 | serves as the root of the entity tree and a starting-point for an <a title="XML Processor" href="#dt-xml-proc">XML processor</a>.] This specification does | ||
1613 | not specify how the document entity is to be located by an XML processor; | ||
1614 | unlike other entities, the document entity has no name and might well appear | ||
1615 | on a processor input stream without any identification at all.</p></div></div><div class="div1"> <h2><a name="sec-conformance" id="sec-conformance"/>5 Conformance</h2><div class="div2"> <h3><a name="proc-types" id="proc-types"/>5.1 Validating and Non-Validating Processors</h3><p>Conforming <a title="XML Processor" href="#dt-xml-proc">XML processors</a> fall into | ||
1616 | two classes: validating and non-validating.</p><p>Validating and non-validating processors alike <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> report violations of | ||
1617 | this specification's well-formedness constraints in the content of the <a title="Document Entity" href="#dt-docent">document entity</a> and any other <a title="Text Entity" href="#dt-parsedent">parsed | ||
1618 | entities</a> that they read.</p><p> | ||
1619 | [<a name="dt-validating" id="dt-validating" title="Validating Processor">Definition</a>: | ||
1620 | <b>Validating | ||
1621 | processors</b> | ||
1622 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em>, | ||
1623 | at user option, report violations of the constraints expressed by | ||
1624 | the declarations in the <a title="Document Type Declaration" href="#dt-doctype">DTD</a>, and failures | ||
1625 | to fulfill the validity constraints given in this specification.] | ||
1626 | To accomplish this, validating XML processors <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> read and process the entire | ||
1627 | DTD and all external parsed entities referenced in the document.</p><p>Non-validating processors are <em class="rfc2119" title="Keyword in RFC 2119 context">REQUIRED</em> to check only the <a title="Document Entity" href="#dt-docent">document | ||
1628 | entity</a>, including the entire internal DTD subset, for well-formedness. [<a name="dt-use-mdecl" id="dt-use-mdecl" title="Process Declarations">Definition</a>: While they are not required | ||
1629 | to check the document for validity, they are <em class="rfc2119" title="Keyword in RFC 2119 context">REQUIRED</em> to <b>process</b> | ||
1630 | all the declarations they read in the internal DTD subset and in any parameter | ||
1631 | entity that they read, up to the first reference to a parameter entity that | ||
1632 | they do <em>not</em> read; that is to say, they <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> use the information | ||
1633 | in those declarations to <a href="#AVNormalize"><cite>normalize</cite></a> | ||
1634 | attribute values, <a href="#included"><cite>include</cite></a> the replacement | ||
1635 | text of internal entities, and supply <a href="#sec-attr-defaults"><cite>default | ||
1636 | attribute values</cite></a>.] Except when <code>standalone="yes"</code>, they | ||
1637 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> | ||
1638 | <a title="Process Declarations" href="#dt-use-mdecl">process</a> | ||
1639 | <a title="entity declaration" href="#dt-entdecl">entity | ||
1640 | declarations</a> or <a title="Attribute-List Declaration" href="#dt-attdecl">attribute-list declarations</a> | ||
1641 | encountered after a reference to a parameter entity that is not read, since | ||
1642 | the entity may have contained overriding declarations; when <code>standalone="yes"</code>, processors <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> | ||
1643 | process these declarations.</p><p>Note | ||
1644 | that when processing invalid documents with a non-validating | ||
1645 | processor the application may not be presented with consistent | ||
1646 | information. For example, several requirements for uniqueness | ||
1647 | within the document may not be met, including more than one element | ||
1648 | with the same id, duplicate declarations of elements or notations | ||
1649 | with the same name, etc. In these cases the behavior of the parser | ||
1650 | with respect to reporting such information to the application is | ||
1651 | undefined.</p><p>XML 1.1 processors <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be able to process both XML 1.0 | ||
1652 | and XML 1.1 documents. Programs which generate XML <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> | ||
1653 | generate XML 1.0, unless one of the specific features of XML 1.1 is required.</p></div><div class="div2"> <h3><a name="safe-behavior" id="safe-behavior"/>5.2 Using XML Processors</h3><p>The behavior of a validating XML processor is highly predictable; it must | ||
1654 | read every piece of a document and report all well-formedness and validity | ||
1655 | violations. Less is required of a non-validating processor; it need not read | ||
1656 | any part of the document other than the document entity. This has two effects | ||
1657 | that may be important to users of XML processors:</p><ul><li><p>Certain well-formedness errors, specifically those that require reading | ||
1658 | external entities, may fail to be detected by a non-validating processor. Examples | ||
1659 | include the constraints entitled <a href="#wf-entdeclared"><cite>Entity Declared</cite></a>, <a href="#textent"><cite>Parsed Entity</cite></a>, and <a href="#norecursion"><cite>No | ||
1660 | Recursion</cite></a>, as well as some of the cases described as <a href="#forbidden"><cite>forbidden</cite></a> in <a href="#entproc"><b>4.4 XML Processor Treatment of Entities and References</b></a>.</p></li><li><p>The information passed from the processor to the application may | ||
1661 | vary, depending on whether the processor reads parameter and external entities. | ||
1662 | For example, a non-validating processor may fail to <a href="#AVNormalize"><cite>normalize</cite></a> | ||
1663 | attribute values, <a href="#included"><cite>include</cite></a> the replacement | ||
1664 | text of internal entities, or supply <a href="#sec-attr-defaults"><cite>default | ||
1665 | attribute values</cite></a>, where doing so depends on having read declarations | ||
1666 | in external or parameter entities.</p></li></ul><p>For maximum reliability in interoperating between different XML processors, | ||
1667 | applications which use non-validating processors <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD NOT</em> rely on any behaviors | ||
1668 | not required of such processors. Applications which require DTD facilities | ||
1669 | not related to validation (such | ||
1670 | as the declaration of default attributes and internal entities that are | ||
1671 | or may be specified in | ||
1672 | external entities<span> | ||
1673 | )</span> | ||
1674 | <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> use validating XML processors.</p></div></div><div class="div1"> <h2><a name="sec-notation" id="sec-notation"/>6 Notation</h2><p>The formal grammar of XML is given in this specification using a simple | ||
1675 | Extended Backus-Naur Form (EBNF) notation. Each rule in the grammar defines | ||
1676 | one symbol, in the form</p><div class="exampleInner"><pre>symbol ::= expression</pre></div><p>Symbols are written with an initial capital letter if they are the | ||
1677 | start symbol of a regular language, otherwise with an initial lowercase | ||
1678 | letter. Literal strings are quoted.</p><p>Within the expression on the right-hand side of a rule, the following expressions | ||
1679 | are used to match strings of one or more characters: </p><dl><dt class="label"> | ||
1680 | <code>#xN</code> | ||
1681 | </dt><dd><p>where <code>N</code> is a hexadecimal integer, the expression matches the character | ||
1682 | whose number | ||
1683 | (code point) in ISO/IEC 10646 is <code>N</code>. The number of leading zeros in the <code>#xN</code> | ||
1684 | form is insignificant.</p></dd><dt class="label"> | ||
1685 | <code>[a-zA-Z]</code>, <code>[#xN-#xN]</code> | ||
1686 | </dt><dd><p>matches any <a href="#NT-Char">Char</a> with a value in the range(s) indicated (inclusive).</p></dd><dt class="label"> | ||
1687 | <code>[abc]</code>, <code>[#xN#xN#xN]</code> | ||
1688 | </dt><dd><p>matches any <a href="#NT-Char">Char</a> with a value among the characters | ||
1689 | enumerated. Enumerations and ranges can be mixed in one set of brackets.</p></dd><dt class="label"> | ||
1690 | <code>[^a-z]</code>, <code>[^#xN-#xN]</code> | ||
1691 | </dt><dd><p>matches any <a href="#NT-Char">Char</a> with a value <em>outside</em> the range | ||
1692 | indicated.</p></dd><dt class="label"> | ||
1693 | <code>[^abc]</code>, <code>[^#xN#xN#xN]</code> | ||
1694 | </dt><dd><p>matches any <a href="#NT-Char">Char</a> with a value not among the characters given. Enumerations | ||
1695 | and ranges of forbidden values can be mixed in one set of brackets.</p></dd><dt class="label"> | ||
1696 | <code>"string"</code> | ||
1697 | </dt><dd><p>matches a literal string <a title="match" href="#dt-match">matching</a> that | ||
1698 | given inside the double quotes.</p></dd><dt class="label"> | ||
1699 | <code>'string'</code> | ||
1700 | </dt><dd><p>matches a literal string <a title="match" href="#dt-match">matching</a> that | ||
1701 | given inside the single quotes.</p></dd></dl><p> These symbols may be combined to match more complex patterns as follows, | ||
1702 | where <code>A</code> and <code>B</code> represent simple expressions: </p><dl><dt class="label">(<code>expression</code>)</dt><dd><p> | ||
1703 | <code>expression</code> is treated as a unit and may be combined as described | ||
1704 | in this list.</p></dd><dt class="label"> | ||
1705 | <code>A?</code> | ||
1706 | </dt><dd><p>matches <code>A</code> or nothing; optional <code>A</code>.</p></dd><dt class="label"> | ||
1707 | <code>A B</code> | ||
1708 | </dt><dd><p>matches <code>A</code> followed by <code>B</code>. This | ||
1709 | operator has higher precedence than alternation; thus <code>A B | C D</code> | ||
1710 | is identical to <code>(A B) | (C D)</code>.</p></dd><dt class="label"> | ||
1711 | <code>A | B</code> | ||
1712 | </dt><dd><p>matches <code>A</code> or <code>B</code>.</p></dd><dt class="label"> | ||
1713 | <code>A - B</code> | ||
1714 | </dt><dd><p>matches any string that matches <code>A</code> but does not match <code>B</code>.</p></dd><dt class="label"> | ||
1715 | <code>A+</code> | ||
1716 | </dt><dd><p>matches one or more occurrences of <code>A</code>. Concatenation | ||
1717 | has higher precedence than alternation; thus <code>A+ | B+</code> is identical | ||
1718 | to <code>(A+) | (B+)</code>.</p></dd><dt class="label"> | ||
1719 | <code>A*</code> | ||
1720 | </dt><dd><p>matches zero or more occurrences of <code>A</code>. Concatenation | ||
1721 | has higher precedence than alternation; thus <code>A* | B*</code> is identical | ||
1722 | to <code>(A*) | (B*)</code>.</p></dd></dl><p> Other notations used in the productions are: </p><dl><dt class="label"> | ||
1723 | <code>/* ... */</code> | ||
1724 | </dt><dd><p>comment.</p></dd><dt class="label"> | ||
1725 | <code>[ wfc: ... ]</code> | ||
1726 | </dt><dd><p>well-formedness constraint; this identifies by name a constraint on <a title="Well-Formed" href="#dt-wellformed">well-formed</a> documents associated with a production.</p></dd><dt class="label"> | ||
1727 | <code>[ vc: ... ]</code> | ||
1728 | </dt><dd><p>validity constraint; this identifies by name a constraint on <a title="Validity" href="#dt-valid">valid</a> | ||
1729 | documents associated with a production.</p></dd></dl><p> | ||
1730 | </p></div></div><div class="back"><div class="div1"> <h2><a name="sec-bibliography" id="sec-bibliography"/>A References</h2><div class="div2"> <h3><a name="sec-existing-stds" id="sec-existing-stds"/>A.1 Normative References</h3><dl><dt class="label"><a name="IANA" id="IANA"/>IANA-CHARSETS</dt><dd>(Internet | ||
1731 | Assigned Numbers Authority) <a href="http://www.iana.org/assignments/character-sets"><cite>Official Names for Character Sets</cite></a>, | ||
1732 | ed. Keld Simonsen et al. (See http://www.iana.org/assignments/character-sets.)</dd><dt class="label"><a name="rfc2119" id="rfc2119"/>IETF RFC 2119</dt><dd>IETF | ||
1733 | (Internet Engineering Task Force). <a href="http://www.ietf.org/rfc/rfc2119.txt"><cite>RFC 2119: Key words for use in RFCs to Indicate Requirement Levels</cite></a>. | ||
1734 | Scott Bradner, 1997. (See http://www.ietf.org/rfc/rfc2119.txt.)</dd><dt class="label"><a name="RFC1766" id="RFC1766"/>IETF RFC 3066</dt><dd>IETF | ||
1735 | (Internet Engineering Task Force). <a href="http://www.ietf.org/rfc/rfc3066.txt"><cite>RFC 3066: Tags for the Identification | ||
1736 | of Languages</cite></a>, ed. H. Alvestrand. 2001. (See http://www.ietf.org/rfc/rfc3066.txt.)</dd><dt class="label"><a name="rfc3986"/>IETF RFC 3986</dt><dd>IETF (Internet Engineering Task Force). <a href="http://www.ietf.org/rfc/rfc3986.txt"><cite>RFC 3986: Uniform Resource Identifier (URI): Generic Syntax</cite></a>. T. Berners-Lee, R. Fielding, L. Masinter. 2005. (See http://www.ietf.org/rfc/rfc3986.txt.)</dd><dt class="label"><a name="ISO10646" id="ISO10646"/>ISO/IEC 10646</dt><dd>ISO (International | ||
1737 | Organization for Standardization). <cite>ISO/IEC 10646-1:2000. Information | ||
1738 | technology — Universal Multiple-Octet Coded Character Set (UCS) — | ||
1739 | Part 1: Architecture and Basic Multilingual Plane</cite> and <cite>ISO/IEC 10646-2:2001. | ||
1740 | Information technology — Universal Multiple-Octet Coded Character Set (UCS) — Part 2: | ||
1741 | Supplementary Planes</cite>, as, from time to time, amended, replaced by a new edition or | ||
1742 | expanded by the addition of new parts. [Geneva]: International Organization for Standardization. | ||
1743 | (See <a href="http://www.iso.ch">http://www.iso.ch</a> for the latest version.)</dd><dt class="label"><a name="Unicode" id="Unicode"/>Unicode</dt><dd>The Unicode Consortium. <em>The Unicode | ||
1744 | Standard, Version 4.0.</em> Reading, Mass.: Addison-Wesley, | ||
1745 | 2003, | ||
1746 | as updated from time to time by the publication of new versions. (See | ||
1747 | <a href="http://www.unicode.org/unicode/standard/versions"> | ||
1748 | http://www.unicode.org/unicode/standard/versions</a> for the latest version | ||
1749 | and additional information on versions of the standard and of the Unicode | ||
1750 | Character Database).</dd><dt class="label"><a name="XML1.0" id="XML1.0"/>XML-1.0</dt><dd>W3C. <a href="http://www.w3.org/TR/xml"><cite>Extensible Markup Language (XML) 1.0 (Fourth | ||
1751 | Edition)</cite></a>. Tim Bray, Jean Paoli, C.M. Sperberg-McQueen, Eve Maler, François Yergeau | ||
1752 | (editors) (See http://www.w3.org/TR/xml.)</dd></dl></div><div class="div2"> <h3><a name="null" id="null"/>A.2 Other References</h3><dl><dt class="label"><a name="Aho" id="Aho"/>Aho/Ullman</dt><dd>Aho, Alfred V., Ravi Sethi, and Jeffrey D. | ||
1753 | Ullman. <cite>Compilers: Principles, Techniques, and Tools</cite>. | ||
1754 | Reading: Addison-Wesley, 1986, rpt. corr. 1988.</dd><dt class="label"><a name="ABK" id="ABK"/>Brüggemann-Klein</dt><dd>Brüggemann-Klein, | ||
1755 | Anne. <a href="ftp://ftp.informatik.uni-freiburg.de/documents/papers/brueggem/habil.ps"><cite>Formal Models in Document Processing</cite></a>. Habilitationsschrift. Faculty | ||
1756 | of Mathematics at the University of Freiburg, 1993. (See ftp://ftp.informatik.uni-freiburg.de/documents/papers/brueggem/habil.ps.)</dd><dt class="label"><a name="ABKDW" id="ABKDW"/>Brüggemann-Klein and Wood</dt><dd>Brüggemann-Klein, | ||
1757 | Anne, and Derick Wood. <cite>Deterministic Regular Languages</cite>. | ||
1758 | Universität Freiburg, Institut für Informatik, Bericht 38, Oktober 1991. Extended | ||
1759 | abstract in A. Finkel, M. Jantzen, Hrsg., STACS 1992, S. 173-184. Springer-Verlag, | ||
1760 | Berlin 1992. Lecture Notes in Computer Science 577. Full version titled <cite>One-Unambiguous | ||
1761 | Regular Languages</cite> in Information and Computation 140 (2): 229-253, | ||
1762 | February 1998.</dd><dt class="label"><a name="Charmod" id="Charmod"/>Charmod</dt><dd>W3C Working Draft. | ||
1763 | <a href="http://www.w3.org/TR/2003/WD-charmod-20030822/"><cite>Character Model for the World Wide Web 1.0</cite></a>. | ||
1764 | Martin J. Dürst, François Yergeau, Richard Ishida, Misha Wolf, Tex Texin. (See http://www.w3.org/TR/2003/WD-charmod-20030822/.)</dd><dt class="label"><a name="Clark" id="Clark"/>Clark</dt><dd>James Clark. | ||
1765 | <a href="http://www.w3.org/TR/NOTE-sgml-xml-971215"><cite>Comparison of SGML and XML</cite></a>. (See http://www.w3.org/TR/NOTE-sgml-xml-971215.)</dd><dt class="label"><a name="IANA-LANGCODES" id="IANA-LANGCODES"/>IANA-LANGCODES</dt><dd>(Internet | ||
1766 | Assigned Numbers Authority) <a href="http://www.iana.org/assignments/language-tags"><cite>Registry of Language Tags</cite></a>, | ||
1767 | ed. Keld Simonsen et al. (See http://www.iana.org/assignments/language-tags.)</dd><dt class="label"><a name="RFC2141" id="RFC2141"/>IETF RFC 2141</dt><dd>IETF | ||
1768 | (Internet Engineering Task Force). <a href="http://www.ietf.org/rfc/rfc2141.txt"><cite>RFC 2141: URN Syntax</cite></a>, ed. | ||
1769 | R. Moats. 1997. (See http://www.ietf.org/rfc/rfc2141.txt.)</dd><dt class="label"><a name="rfc2376" id="rfc2376"/>IETF RFC 3023</dt><dd>IETF | ||
1770 | (Internet Engineering Task Force). <a href="http://www.ietf.org/rfc/rfc3023.txt"><cite>RFC 3023: XML Media Types</cite></a>. | ||
1771 | eds. M. Murata, S. St.Laurent, D. Kohn. 2001. (See http://www.ietf.org/rfc/rfc3023.txt.)</dd><dt class="label"><a name="rfc2781" id="rfc2781"/>IETF RFC 2781</dt><dd>IETF | ||
1772 | (Internet Engineering Task Force). <a href="http://www.ietf.org/rfc/rfc2781.txt"><cite>RFC 2781: UTF-16, an encoding | ||
1773 | of ISO 10646</cite></a>, ed. P. Hoffman, F. Yergeau. 2000. (See http://www.ietf.org/rfc/rfc2781.txt.)</dd><dt class="label"><a name="ISO639" id="ISO639"/>ISO 639</dt><dd>(International Organization for Standardization). | ||
1774 | <cite>ISO 639:1988 (E). | ||
1775 | Code for the representation of names of languages.</cite> [Geneva]: International | ||
1776 | Organization for Standardization, 1988.</dd><dt class="label"><a name="ISO3166" id="ISO3166"/>ISO 3166</dt><dd>(International Organization for Standardization). | ||
1777 | <cite>ISO 3166-1:1997 | ||
1778 | (E). Codes for the representation of names of countries and their subdivisions — | ||
1779 | Part 1: Country codes</cite> [Geneva]: International Organization for | ||
1780 | Standardization, 1997.</dd><dt class="label"><a name="ISO8879" id="ISO8879"/>ISO 8879</dt><dd>ISO (International Organization for Standardization). <cite>ISO | ||
1781 | 8879:1986(E). Information processing — Text and Office Systems — | ||
1782 | Standard Generalized Markup Language (SGML).</cite> First edition — | ||
1783 | 1986-10-15. [Geneva]: International Organization for Standardization, 1986. </dd><dt class="label"><a name="ISO10744" id="ISO10744"/>ISO/IEC 10744</dt><dd>ISO (International Organization for | ||
1784 | Standardization). <cite>ISO/IEC 10744-1992 (E). Information technology — | ||
1785 | Hypermedia/Time-based Structuring Language (HyTime). </cite> [Geneva]: | ||
1786 | International Organization for Standardization, 1992. <em>Extended Facilities | ||
1787 | Annexe.</em> [Geneva]: International Organization for Standardization, 1996. </dd><dt class="label"><a name="websgml" id="websgml"/>WEBSGML</dt><dd>ISO | ||
1788 | (International Organization for Standardization). <a href="http://www.sgmlsource.com/8879/n0029.htm"><cite>ISO 8879:1986 | ||
1789 | TC2. Information technology — Document Description and Processing Languages</cite></a>. | ||
1790 | [Geneva]: International Organization for Standardization, 1998. (See http://www.sgmlsource.com/8879/n0029.htm.)</dd><dt class="label"><a name="xml-names" id="xml-names"/>XML Names</dt><dd>Tim Bray, | ||
1791 | Dave Hollander, and Andrew Layman, editors. <a href="http://www.w3.org/TR/REC-xml-names/"><cite>Namespaces in XML</cite></a>. | ||
1792 | Textuality, Hewlett-Packard, and Microsoft. World Wide Web Consortium, 1999. (See http://www.w3.org/TR/REC-xml-names/.)</dd></dl></div></div><div class="div1"> <h2><a name="sec-CharNorm" id="sec-CharNorm"/>B Definitions for Character Normalization</h2><p>This appendix contains the necessary definitions for character normalization. | ||
1793 | For additional background information and examples, see <a href="#Charmod">[Charmod]</a>.</p><p> | ||
1794 | |||
1795 | [<a name="dt-Uni-encform" id="dt-Uni-encform" title="Unicode encoding form">Definition</a>: Text is said to be | ||
1796 | in a <b>Unicode encoding form</b> if it is encoded in | ||
1797 | UTF-8, UTF-16 or UTF-32.] | ||
1798 | </p><p> | ||
1799 | |||
1800 | [<a name="dt-legacyenc" id="dt-legacyenc" title="legacy encoding">Definition</a>: | ||
1801 | <b>Legacy encoding</b> | ||
1802 | is taken to mean any character encoding not based on Unicode.] | ||
1803 | </p><p> | ||
1804 | |||
1805 | [<a name="dt-normtransc" id="dt-normtransc" title="normalizing transcoder">Definition</a>: A | ||
1806 | <b>normalizing transcoder</b> is a transcoder that converts from a | ||
1807 | <a title="legacy encoding" href="#dt-legacyenc">legacy encoding</a> to a | ||
1808 | <a title="Unicode encoding form" href="#dt-Uni-encform">Unicode encoding form</a> and | ||
1809 | ensures that the result is in Unicode Normalization Form C | ||
1810 | (see UAX #15 <a href="#Unicode">[Unicode]</a>).] | ||
1811 | </p><p> | ||
1812 | [<a name="dt-charesc" id="dt-charesc" title="character escape">Definition</a>: A <b>character escape</b> | ||
1813 | is a syntactic device defined in a markup or programming language that allows | ||
1814 | one or more of:] | ||
1815 | </p><ol class="enumar"><li><p>expressing syntax-significant characters while disregarding | ||
1816 | their significance in the syntax of the language, or</p></li><li><p>expressing characters not representable in the character encoding | ||
1817 | chosen for an instance of the language, or</p></li><li><p>expressing characters in general, without use of the corresponding | ||
1818 | character codes.</p></li></ol><p> | ||
1819 | |||
1820 | [<a name="dt-certified" id="dt-certified" title="certified">Definition</a>: | ||
1821 | <b>Certified</b> text | ||
1822 | is text which satisfies at least one of the following conditions:] | ||
1823 | </p><ol class="enumar"><li><p>it has been confirmed through inspection that the text | ||
1824 | is in normalized form</p></li><li><p>the source text-processing component is identified | ||
1825 | and is known to produce only normalized text.</p></li></ol><p> | ||
1826 | |||
1827 | [<a name="dt-uninorm" id="dt-uninorm" title="Unicode-normalized">Definition</a>: Text is, for the purposes of | ||
1828 | this specification, <b>Unicode-normalized</b> if it is in a | ||
1829 | <a title="Unicode encoding form" href="#dt-Uni-encform">Unicode encoding form</a> and is in | ||
1830 | Unicode Normalization Form C, according to a version of Unicode Standard Annex #15: | ||
1831 | Unicode Normalization Forms <a href="#Unicode">[Unicode]</a> at least as recent as the | ||
1832 | oldest version of the Unicode Standard that contains all the characters | ||
1833 | actually present in the text, but no earlier | ||
1834 | than version 3.2.] | ||
1835 | </p><p> | ||
1836 | |||
1837 | [<a name="dt-inclnorm" id="dt-inclnorm" title="include-normalized">Definition</a>: Text is | ||
1838 | <b>include-normalized</b> if:] | ||
1839 | </p><ol class="enumar"><li><p>the text is <a title="Unicode-normalized" href="#dt-uninorm">Unicode-normalized</a> | ||
1840 | and does not contain any <a title="character escape" href="#dt-charesc">character escapes</a> | ||
1841 | or <a title="Include" href="#dt-include">includes</a> whose expansion would | ||
1842 | cause the text to become no longer <a title="Unicode-normalized" href="#dt-uninorm">Unicode-normalized</a>; | ||
1843 | or</p></li><li><p>the text is in a <a title="legacy encoding" href="#dt-legacyenc">legacy encoding</a> and, if it were transcoded | ||
1844 | to a <a title="Unicode encoding form" href="#dt-Uni-encform">Unicode encoding form</a> by a | ||
1845 | <a title="normalizing transcoder" href="#dt-normtransc">normalizing transcoder</a>, the resulting | ||
1846 | text would satisfy clause 1 above.</p></li></ol><p> | ||
1847 | |||
1848 | [<a name="dt-compchar" id="dt-compchar" title="composing character">Definition</a>: A <b>composing character</b> | ||
1849 | is a character that is one or both of the following:] | ||
1850 | </p><ol class="enumar"><li><p>the second character in the canonical decomposition mapping of | ||
1851 | some primary composite (as defined in D3 of UAX #15 <a href="#Unicode">[Unicode]</a>), or</p></li><li><p>of non-zero canonical combining class (as defined in Unicode | ||
1852 | <a href="#Unicode">[Unicode]</a>).</p></li></ol><p> | ||
1853 | |||
1854 | [<a name="dt-fullnorm" id="dt-fullnorm" title="fully normalized">Definition</a>: Text is | ||
1855 | <b>fully-normalized</b> if:] | ||
1856 | </p><ol class="enumar"><li><p>the text is in a <a title="Unicode encoding form" href="#dt-Uni-encform">Unicode encoding | ||
1857 | form</a>, is <a title="include-normalized" href="#dt-inclnorm">include-normalized</a> and | ||
1858 | none of the <a title="" href="#dt-relconst">relevant | ||
1859 | constructs</a> comprising the text begin with a | ||
1860 | <a title="composing character" href="#dt-compchar">composing character</a> or a | ||
1861 | character escape representing a | ||
1862 | <a title="composing character" href="#dt-compchar">composing character</a>; or</p></li><li><p>the text is in a <a title="legacy encoding" href="#dt-legacyenc">legacy encoding</a> and, | ||
1863 | if it were transcoded to a <a title="Unicode encoding form" href="#dt-Uni-encform">Unicode encoding form</a> | ||
1864 | by a <a title="normalizing transcoder" href="#dt-normtransc">normalizing transcoder</a>, the resulting text | ||
1865 | would satisfy clause 1 above.</p></li></ol></div><div class="div1"> <h2><a name="sec-entexpand" id="sec-entexpand"/>C Expansion of Entity and Character References (Non-Normative)</h2><p>This appendix contains some examples illustrating the sequence of entity- | ||
1866 | and character-reference recognition and expansion, as specified in <a href="#entproc"><b>4.4 XML Processor Treatment of Entities and References</b></a>.</p><p>If the DTD contains the declaration</p><div class="exampleInner"><pre><!ENTITY example "<p>An ampersand (&#38;#38;) may be escaped | ||
1867 | numerically (&#38;#38;#38;) or with a general entity | ||
1868 | (&amp;amp;).</p>" ></pre></div><p>then the XML processor will recognize the character references when it | ||
1869 | parses the entity declaration, and resolve them before storing the following | ||
1870 | string as the value of the entity "<code>example</code>":</p><div class="exampleInner"><pre><p>An ampersand (&#38;) may be escaped | ||
1871 | numerically (&#38;#38;) or with a general entity | ||
1872 | (&amp;amp;).</p></pre></div><p>A reference in the document to "<code>&example;</code>" | ||
1873 | will cause the text to be reparsed, at which time the start- and end-tags | ||
1874 | of the <code>p</code> element will be recognized and the three references will | ||
1875 | be recognized and expanded, resulting in a <code>p</code> element with the following | ||
1876 | content (all data, no delimiters or markup):</p><div class="exampleInner"><pre>An ampersand (&) may be escaped | ||
1877 | numerically (&#38;) or with a general entity | ||
1878 | (&amp;).</pre></div><p>A more complex example will illustrate the rules and their effects fully. | ||
1879 | In the following example, the line numbers are solely for reference.</p><div class="exampleInner"><pre>1 <?xml version='1.<span>1</span>'?> | ||
1880 | 2 <!DOCTYPE test [ | ||
1881 | 3 <!ELEMENT test (#PCDATA) > | ||
1882 | 4 <!ENTITY % xx '&#37;zz;'> | ||
1883 | 5 <!ENTITY % zz '&#60;!ENTITY tricky "error-prone" >' > | ||
1884 | 6 %xx; | ||
1885 | 7 ]> | ||
1886 | 8 <test>This sample shows a &tricky; method.</test></pre></div><p>This produces the following:</p><ul><li><p>in line 4, the reference to character 37 is expanded immediately, | ||
1887 | and the parameter entity "<code>xx</code>" is stored in the symbol | ||
1888 | table with the value "<code>%zz;</code>". Since the replacement | ||
1889 | text is not rescanned, the reference to parameter entity "<code>zz</code>" | ||
1890 | is not recognized. (And it would be an error if it were, since "<code>zz</code>" | ||
1891 | is not yet declared.)</p></li><li><p>in line 5, the character reference "<code>&#60;</code>" | ||
1892 | is expanded immediately and the parameter entity "<code>zz</code>" | ||
1893 | is stored with the replacement text "<code><!ENTITY tricky "error-prone"></code>", which is a well-formed entity declaration.</p></li><li><p>in line 6, the reference to "<code>xx</code>" is recognized, | ||
1894 | and the replacement text of "<code>xx</code>" (namely "<code>%zz;</code>") | ||
1895 | is parsed. The reference to "<code>zz</code>" is recognized in | ||
1896 | its turn, and its replacement text ("<code><!ENTITY tricky "error-prone"></code>") is parsed. The general entity "<code>tricky</code>" | ||
1897 | has now been declared, with the replacement text "<code>error-prone</code>".</p></li><li><p>in line 8, the reference to the general entity "<code>tricky</code>" | ||
1898 | is recognized, and it is expanded, so the full content of the <code>test</code> | ||
1899 | element is the self-describing (and ungrammatical) string <em>This sample | ||
1900 | shows a error-prone method.</em> | ||
1901 | </p></li></ul></div><div class="div1"> <h2><a name="determinism" id="determinism"/>D Deterministic Content Models (Non-Normative)</h2><p>As | ||
1902 | noted in <a href="#sec-element-content"><b>3.2.1 Element Content</b></a>, it is required that content | ||
1903 | models in element type declarations be deterministic. This requirement is <a title="For Compatibility" href="#dt-compat">for compatibility</a> with SGML (which calls deterministic | ||
1904 | content models "unambiguous"); XML processors built | ||
1905 | using SGML systems may flag non-deterministic content models as errors.</p><p>For example, the content model <code>((b, c) | (b, d))</code> is non-deterministic, | ||
1906 | because given an initial <code>b</code> the XML processor | ||
1907 | cannot know which <code>b</code> in the model is being matched without looking | ||
1908 | ahead to see which element follows the <code>b</code>. In this case, the two references | ||
1909 | to <code>b</code> can be collapsed into a single reference, making the model read <code>(b, | ||
1910 | (c | d))</code>. An initial <code>b</code> now clearly matches only a single name | ||
1911 | in the content model. The processor doesn't need to look ahead to see what follows; either <code>c</code> or <code>d</code> | ||
1912 | would be accepted.</p><p>More formally: a finite state automaton may be constructed from the content | ||
1913 | model using the standard algorithms, e.g. algorithm 3.5 in section 3.9 of | ||
1914 | Aho, Sethi, and Ullman <a href="#Aho">[Aho/Ullman]</a>. In many such algorithms, a follow | ||
1915 | set is constructed for each position in the regular expression (i.e., each | ||
1916 | leaf node in the syntax tree for the regular expression); if any position | ||
1917 | has a follow set in which more than one following position is labeled with | ||
1918 | the same element type name, then the content model is in error and may be | ||
1919 | reported as an error.</p><p>Algorithms exist which allow many but not all non-deterministic content | ||
1920 | models to be reduced automatically to equivalent deterministic models; see | ||
1921 | Brüggemann-Klein 1991 <a href="#ABK">[Brüggemann-Klein]</a>.</p></div><div class="div1"> <h2><a name="sec-guessing" id="sec-guessing"/>E Autodetection of Character Encodings (Non-Normative)</h2><p>The XML encoding declaration functions as an internal label on each entity, | ||
1922 | indicating which character encoding is in use. Before an XML processor can | ||
1923 | read the internal label, however, it apparently has to know what character | ||
1924 | encoding is in use — which is what the internal label is trying to indicate. | ||
1925 | In the general case, this is a hopeless situation. It is not entirely hopeless | ||
1926 | in XML, however, because XML limits the general case in two ways: each implementation | ||
1927 | is assumed to support only a finite set of character encodings, and the XML | ||
1928 | encoding declaration is restricted in position and content in order to make | ||
1929 | it feasible to autodetect the character encoding in use in each entity in | ||
1930 | normal cases. Also, in many cases other sources of information are available | ||
1931 | in addition to the XML data stream itself. Two cases may be distinguished, | ||
1932 | depending on whether the XML entity is presented to the processor without, | ||
1933 | or with, any accompanying (external) information. We consider the first case | ||
1934 | first.</p><div class="div2"> <h3><a name="sec-guessing-no-ext-info" id="sec-guessing-no-ext-info"/>E.1 Detection Without External Encoding Information</h3><p>Because each XML entity not accompanied by external | ||
1935 | encoding information and not in UTF-8 or UTF-16 encoding must | ||
1936 | begin with an XML encoding declaration, in which the first characters must | ||
1937 | be '<code><?xml</code>', any conforming processor can detect, after two | ||
1938 | to four octets of input, which of the following cases apply. In reading this | ||
1939 | list, it may help to know that in UCS-4, '<' is "<code>#x0000003C</code>" | ||
1940 | and '?' is "<code>#x0000003F</code>", and the Byte Order Mark | ||
1941 | required of UTF-16 data streams is "<code>#xFEFF</code>". The notation | ||
1942 | <var>##</var> is used to denote any byte value except that two consecutive | ||
1943 | <var>##</var>s cannot be both 00.</p><p>With a Byte Order Mark:</p><table border="1" frame="border" summary="Encoding detection summary"><tbody><tr><td> | ||
1944 | <code>00 00 FE | ||
1945 | FF</code> | ||
1946 | </td><td>UCS-4, big-endian machine (1234 order)</td></tr><tr><td> | ||
1947 | <code>FF | ||
1948 | FE 00 00</code> | ||
1949 | </td><td>UCS-4, little-endian machine (4321 order)</td></tr><tr><td> | ||
1950 | <code>00 00 FF FE</code> | ||
1951 | </td><td>UCS-4, unusual octet order (2143)</td></tr><tr><td> | ||
1952 | <code>FE FF 00 00</code> | ||
1953 | </td><td>UCS-4, unusual octet order (3412)</td></tr><tr><td> | ||
1954 | <code>FE FF ## ##</code> | ||
1955 | </td><td>UTF-16, big-endian</td></tr><tr><td> | ||
1956 | <code>FF FE ## ##</code> | ||
1957 | </td><td>UTF-16, little-endian</td></tr><tr><td> | ||
1958 | <code>EF BB BF</code> | ||
1959 | </td><td>UTF-8</td></tr></tbody></table><p>Without a Byte Order Mark:</p><table border="1" frame="border" summary="Encoding detection summary"><tbody><tr><td> | ||
1960 | <code>00 00 00 3C</code> | ||
1961 | </td><td rowspan="4">UCS-4 or other encoding with a 32-bit code unit and ASCII | ||
1962 | characters encoded as ASCII values, in respectively big-endian (1234), little-endian | ||
1963 | (4321) and two unusual byte orders (2143 and 3412). The encoding declaration | ||
1964 | must be read to determine which of UCS-4 or other supported 32-bit encodings | ||
1965 | applies.</td></tr><tr><td> | ||
1966 | <code>3C 00 00 00</code> | ||
1967 | </td></tr><tr><td> | ||
1968 | <code>00 00 3C 00</code> | ||
1969 | </td></tr><tr><td> | ||
1970 | <code>00 3C 00 00</code> | ||
1971 | </td></tr><tr><td> | ||
1972 | <code>00 3C 00 3F</code> | ||
1973 | </td><td>UTF-16BE or big-endian ISO-10646-UCS-2 | ||
1974 | or other encoding with a 16-bit code unit in big-endian order and ASCII characters | ||
1975 | encoded as ASCII values (the encoding declaration must be read to determine | ||
1976 | which)</td></tr><tr><td> | ||
1977 | <code>3C 00 3F 00</code> | ||
1978 | </td><td>UTF-16LE or little-endian | ||
1979 | ISO-10646-UCS-2 or other encoding with a 16-bit code unit in little-endian | ||
1980 | order and ASCII characters encoded as ASCII values (the encoding declaration | ||
1981 | must be read to determine which)</td></tr><tr><td> | ||
1982 | <code>3C 3F 78 6D</code> | ||
1983 | </td><td>UTF-8, ISO 646, ASCII, some part of ISO 8859, Shift-JIS, EUC, or any other | ||
1984 | 7-bit, 8-bit, or mixed-width encoding which ensures that the characters of | ||
1985 | ASCII have their normal positions, width, and values; the actual encoding | ||
1986 | declaration must be read to detect which of these applies, but since all of | ||
1987 | these encodings use the same bit patterns for the relevant ASCII characters, | ||
1988 | the encoding declaration itself may be read reliably</td></tr><tr><td> | ||
1989 | <code>4C | ||
1990 | 6F A7 94</code> | ||
1991 | </td><td>EBCDIC (in some flavor; the full encoding declaration | ||
1992 | must be read to tell which code page is in use)</td></tr><tr><td>Other</td><td>UTF-8 without an encoding declaration, or else the data stream is mislabeled | ||
1993 | (lacking a required encoding declaration), corrupt, fragmentary, or enclosed | ||
1994 | in a wrapper of some kind</td></tr></tbody></table><div class="note"><p class="prefix"><b>Note:</b></p><p>In cases above which do not require reading the encoding declaration to | ||
1995 | determine the encoding, section 4.3.3 still requires that the encoding declaration, | ||
1996 | if present, be read and that the encoding name be checked to match the actual | ||
1997 | encoding of the entity. Also, it is possible that new character encodings | ||
1998 | will be invented that will make it necessary to use the encoding declaration | ||
1999 | to determine the encoding, in cases where this is not required at present.</p></div><p>This level of autodetection is enough to read the XML encoding declaration | ||
2000 | and parse the character-encoding identifier, which is still necessary to distinguish | ||
2001 | the individual members of each family of encodings (e.g. to tell UTF-8 from | ||
2002 | 8859, and the parts of 8859 from each other, or to distinguish the specific | ||
2003 | EBCDIC code page in use, and so on).</p><p>Because the contents of the encoding declaration are restricted to characters | ||
2004 | from the ASCII repertoire (however encoded), | ||
2005 | a processor can reliably read the entire encoding declaration as soon as it | ||
2006 | has detected which family of encodings is in use. Since in practice, all widely | ||
2007 | used character encodings fall into one of the categories above, the XML encoding | ||
2008 | declaration allows reasonably reliable in-band labeling of character encodings, | ||
2009 | even when external sources of information at the operating-system or transport-protocol | ||
2010 | level are unreliable. Character encodings such as UTF-7 | ||
2011 | that make overloaded usage of ASCII-valued bytes may fail to be reliably detected.</p><p>Once the processor has detected the character encoding in use, it can act | ||
2012 | appropriately, whether by invoking a separate input routine for each case, | ||
2013 | or by calling the proper conversion function on each character of input.</p><p>Like any self-labeling system, the XML encoding declaration will not work | ||
2014 | if any software changes the entity's character set or encoding without updating | ||
2015 | the encoding declaration. Implementors of character-encoding routines should | ||
2016 | be careful to ensure the accuracy of the internal and external information | ||
2017 | used to label the entity.</p></div><div class="div2"> <h3><a name="sec-guessing-with-ext-info" id="sec-guessing-with-ext-info"/>E.2 Priorities in the Presence of External Encoding Information</h3><p>The second possible case occurs when the XML entity is accompanied by encoding | ||
2018 | information, as in some file systems and some network protocols. When multiple | ||
2019 | sources of information are available, their relative priority and the preferred | ||
2020 | method of handling conflict should be specified as part of the higher-level | ||
2021 | protocol used to deliver XML. In particular, please refer | ||
2022 | to <a href="#rfc2376">[IETF RFC 3023]</a> or its successor, which defines the <code>text/xml</code> | ||
2023 | and <code>application/xml</code> MIME types and provides some useful guidance. | ||
2024 | In the interests of interoperability, however, the following rule is recommended.</p><ul><li><p>If an XML entity is in a file, the Byte-Order Mark and encoding declaration are used | ||
2025 | (if present) to determine the character encoding.</p></li></ul></div></div><div class="div1"> <h2><a name="sec-xml-wg" id="sec-xml-wg"/>F W3C XML Working Group (Non-Normative)</h2><p>This specification was prepared and approved for publication by the W3C | ||
2026 | XML Working Group (WG). WG approval of this specification does not necessarily | ||
2027 | imply that all WG participants voted for its approval. The current and former members | ||
2028 | in the XML WG are:</p><ul><li>Jon Bosak, Sun (<i>Chair</i>) </li><li>James Clark (<i>Technical Lead</i>) </li><li>Tim Bray, Textuality and Netscape (<i>XML Co-editor</i>) </li><li>Jean Paoli, Microsoft (<i>XML | ||
2029 | Co-editor</i>) </li><li>C. M. Sperberg-McQueen, U. of Ill. (<i>XML Co-editor</i>) </li><li>Dan Connolly, W3C (<i>W3C Liaison</i>) </li><li>Paula Angerstein, Texcel</li><li>Steve DeRose, INSO</li><li>Dave Hollander, HP</li><li>Eliot Kimber, ISOGEN</li><li>Eve Maler, ArborText</li><li>Tom Magliery, NCSA</li><li>Murray Maloney, SoftQuad, Grif | ||
2030 | SA, Muzmo and Veo Systems</li><li>MURATA Makoto (FAMILY Given), Fuji | ||
2031 | Xerox Information Systems</li><li>Joel Nava, Adobe</li><li>Conleth O'Connell, Vignette</li><li>Peter Sharpe, SoftQuad</li><li>John Tigue, DataChannel</li></ul></div><div class="div1"> <h2><a name="sec-core-wg" id="sec-core-wg"/>G W3C XML Core Working Group (Non-Normative)</h2><p>The <span>second</span> edition of this specification was prepared by the W3C XML Core | ||
2032 | Working Group (WG). The participants in the WG at the time of publication of this | ||
2033 | edition were:</p><ul><li>Leonid Arbouzov, Sun Microsystems</li><li>John Cowan</li><li>Andrew Fang, PTC-Arbortext</li><li>Paul Grosso, PTC-Arbortext (<i>Co-Chair</i>) </li><li>Konrad Lanz, A-SIT</li><li>Philippe Le Hégaret, W3C (<i>Staff Contact</i>) </li><li>Glenn Marcy, IBM</li><li>Sandra Martinez, NIST</li><li>Ravindrakumar R, CDAC</li><li>Lew Shannon</li><li>Henry Thompson, W3C (<i>Staff Contact</i>) </li><li>Richard Tobin, University of Edinburgh</li><li>Daniel Veillard</li><li>Norman Walsh, Sun Microsystems (<i>Co-Chair</i>) </li><li>François Yergeau</li></ul></div><div class="div1"> <h2><a name="prod-notes" id="prod-notes"/>H Production Notes (Non-Normative)</h2><p>This edition was encoded in a | ||
2034 | slightly modified version of the | ||
2035 | <a href="http://www.w3.org/2002/xmlspec/dtd/2.10/xmlspec.dtd">XMLspec DTD, 2.10</a>. | ||
2036 | The XHTML versions were produced with a combination of the | ||
2037 | <a href="http://www.w3.org/2002/xmlspec/xhtml/1.13/xmlspec.xsl">xmlspec.xsl</a>, | ||
2038 | <a href="http://www.w3.org/2002/xmlspec/xhtml/1.13/diffspec.xsl">diffspec.xsl</a>, | ||
2039 | and <a href="REC-xml.xsl">REC-xml.xsl</a> | ||
2040 | XSLT stylesheets.</p></div><div class="div1"> <h2><a name="sec-suggested-names" id="sec-suggested-names"/>I Suggestions for XML Names (Non-Normative)</h2><p>The following suggestions define what is believed to be best | ||
2041 | practice in the construction of XML names used as element names, | ||
2042 | attribute names, processing instruction targets, entity names, | ||
2043 | notation names, and the values of attributes of type ID, and are | ||
2044 | intended as guidance for document authors and schema designers. | ||
2045 | All references to Unicode are understood with respect to | ||
2046 | a particular version of the Unicode Standard greater than or equal | ||
2047 | to 3.0; which version should be used is left to the discretion of | ||
2048 | the document author or schema designer.</p><p>The first two suggestions are directly derived from the rules | ||
2049 | given for identifiers in the Unicode Standard, version 3.0, and | ||
2050 | exclude all control characters, enclosing nonspacing marks, | ||
2051 | non-decimal numbers, private-use characters, punctuation characters | ||
2052 | (with the noted exceptions), symbol characters, unassigned | ||
2053 | codepoints, and white space characters. The other suggestions | ||
2054 | are mostly derived from <a href="#XML1.0">[XML-1.0]</a> Appendix B.</p><ol class="enumar"><li><p>The first character of any name should have a Unicode General | ||
2055 | Category of Ll, Lu, Lo, Lm, Lt, or Nl, or else be '_' #x5F.</p></li><li><p>Characters other than the first should have a Unicode General | ||
2056 | Category of Ll, Lu, Lo, Lm, Lt, Mc, Mn, Nl, Nd, Pc, or Cf, or else | ||
2057 | be one of the following: '-' #x2D, '.' #x2E, ':' #x3A or | ||
2058 | '·' #xB7 (middle dot). Since Cf characters are not | ||
2059 | directly visible, they should be employed with caution and only | ||
2060 | when necessary, to avoid creating names which are distinct to XML | ||
2061 | processors but look the same to human beings.</p></li><li><p>Ideographic characters which have a canonical decomposition | ||
2062 | (including those in the ranges [#xF900-#xFAFF] and | ||
2063 | [#x2F800-#x2FFFD], with 12 exceptions) should not be used in names. | ||
2064 | </p></li><li><p>Characters which have a compatibility decomposition (those with | ||
2065 | a "compatibility formatting tag" in field 5 of the Unicode | ||
2066 | Character Database -- marked by field 5 beginning with a "<") | ||
2067 | should not be used in names. This suggestion does not apply | ||
2068 | to #x0E33 THAI CHARACTER SARA AM or #x0EB3 LAO CHARACTER AM, which | ||
2069 | despite their compatibility decompositions are in regular use in | ||
2070 | those scripts.</p></li><li><p>Combining characters meant for use with symbols only (including | ||
2071 | those in the ranges [#x20D0-#x20EF] and [#x1D165-#x1D1AD]) should | ||
2072 | not be used in names.</p></li><li><p>The interlinear annotation characters ([#xFFF9-#xFFFB<span> | ||
2073 | ]</span>) should | ||
2074 | not be used in names.</p></li><li><p>Variation selector characters should not be used in names.</p></li><li><p>Names which are nonsensical, unpronounceable, hard to read, or | ||
2075 | easily confusable with other names should not be employed.</p></li></ol></div></div></body></html> | ||
diff --git a/misc/xml-grammar b/misc/xml-grammar new file mode 100644 index 0000000..023e166 --- /dev/null +++ b/misc/xml-grammar | |||
@@ -0,0 +1,7 @@ | |||
1 | document ::= prolog element misc* | ||
2 | |||
3 | prolog ::= XMLDecl misc* (doctypedecl misc*)? | ||
4 | |||
5 | XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? ws? '?>' | ||
6 | |||
7 | VersionInfo ::= ws | ||
diff --git a/src/fstring.h b/src/fstring.h index 8b22160..fb4bf55 100644 --- a/src/fstring.h +++ b/src/fstring.h | |||
@@ -187,7 +187,26 @@ namespace Bu | |||
187 | append( &cData, 1 ); | 187 | append( &cData, 1 ); |
188 | } | 188 | } |
189 | } | 189 | } |
190 | 190 | ||
191 | /** | ||
192 | * Append another FString to this one. | ||
193 | *@param sData (MyType &) The FString to append. | ||
194 | */ | ||
195 | void append( const MyType & sData ) | ||
196 | { | ||
197 | append( sData.getStr(), sData.getSize() ); | ||
198 | } | ||
199 | |||
200 | /** | ||
201 | * Append another FString to this one. | ||
202 | *@param sData (MyType &) The FString to append. | ||
203 | *@param nLen How much data to append. | ||
204 | */ | ||
205 | void append( const MyType & sData, long nLen ) | ||
206 | { | ||
207 | append( sData.getStr(), nLen ); | ||
208 | } | ||
209 | |||
191 | /** | 210 | /** |
192 | * Prepend another FString to this one. | 211 | * Prepend another FString to this one. |
193 | *@param sData (MyType &) The FString to prepend. | 212 | *@param sData (MyType &) The FString to prepend. |
diff --git a/src/unit/xml.cpp b/src/unit/xml.cpp new file mode 100644 index 0000000..e845cc1 --- /dev/null +++ b/src/unit/xml.cpp | |||
@@ -0,0 +1,39 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007-2008 Xagasoft, All rights reserved. | ||
3 | * | ||
4 | * This file is part of the libbu++ library and is released under the | ||
5 | * terms of the license contained in the file LICENSE. | ||
6 | */ | ||
7 | |||
8 | #include "bu/fstring.h" | ||
9 | #include "bu/unitsuite.h" | ||
10 | #include "bu/xmlreader.h" | ||
11 | #include "bu/membuf.h" | ||
12 | |||
13 | class Unit : public Bu::UnitSuite | ||
14 | { | ||
15 | public: | ||
16 | Unit() | ||
17 | { | ||
18 | setName("Xml"); | ||
19 | addTest( Unit::declaration ); | ||
20 | } | ||
21 | |||
22 | virtual ~Unit() | ||
23 | { | ||
24 | } | ||
25 | |||
26 | void declaration() | ||
27 | { | ||
28 | Bu::FString sXml("<?xml ?> <hi />"); | ||
29 | Bu::MemBuf buf( sXml ); | ||
30 | Bu::XmlReader xr( buf ); | ||
31 | } | ||
32 | |||
33 | }; | ||
34 | |||
35 | int main( int argc, char *argv[] ) | ||
36 | { | ||
37 | return Unit().run( argc, argv ); | ||
38 | } | ||
39 | |||
diff --git a/src/xmlreader.cpp b/src/xmlreader.cpp new file mode 100644 index 0000000..9d299e6 --- /dev/null +++ b/src/xmlreader.cpp | |||
@@ -0,0 +1,165 @@ | |||
1 | #include "bu/xmlreader.h" | ||
2 | #include "bu/stream.h" | ||
3 | #include "bu/exceptions.h" | ||
4 | |||
5 | Bu::XmlReader::XmlReader( Stream &rInput ) : | ||
6 | rInput( rInput ), | ||
7 | iCurToken( 0 ), | ||
8 | iNextToken( 0 ), | ||
9 | bIgnoreWS( true ) | ||
10 | { | ||
11 | nextToken(); | ||
12 | stDocument(); | ||
13 | } | ||
14 | |||
15 | Bu::XmlReader::~XmlReader() | ||
16 | { | ||
17 | } | ||
18 | |||
19 | void Bu::XmlReader::fillBuffer() | ||
20 | { | ||
21 | if( rInput.isEOS() ) | ||
22 | return; | ||
23 | char buf[1024]; | ||
24 | int iSize = rInput.read( buf, 1024 ); | ||
25 | sBuf.append( buf, iSize ); | ||
26 | } | ||
27 | |||
28 | void Bu::XmlReader::cleanupBuffer( int iUsed ) | ||
29 | { | ||
30 | for( int j = 0; j < iUsed; j++ ) | ||
31 | { | ||
32 | if( sBuf[j] == '\n' ) | ||
33 | { | ||
34 | spNextToken.iLine++; | ||
35 | spNextToken.iChar = 1; | ||
36 | } | ||
37 | else | ||
38 | { | ||
39 | spNextToken.iChar++; | ||
40 | } | ||
41 | } | ||
42 | |||
43 | printf("--Deleting %d bytes from front of buffer.\n", iUsed ); | ||
44 | sBuf.trimFront( iUsed ); | ||
45 | } | ||
46 | |||
47 | int Bu::XmlReader::nextToken() | ||
48 | { | ||
49 | fillBuffer(); | ||
50 | |||
51 | int iUsed = 1; | ||
52 | |||
53 | iCurToken = iNextToken; | ||
54 | spCurToken = spNextToken; | ||
55 | |||
56 | switch( sBuf[0] ) | ||
57 | { | ||
58 | case '<': | ||
59 | if( !strncmp( sBuf.getStr(), "<?xml", 5 ) ) | ||
60 | { | ||
61 | iNextToken = tokXmlDeclHead; | ||
62 | iUsed = 5; | ||
63 | } | ||
64 | else | ||
65 | { | ||
66 | iNextToken = '<'; | ||
67 | } | ||
68 | break; | ||
69 | |||
70 | case '?': | ||
71 | if( sBuf[1] == '>' ) | ||
72 | { | ||
73 | iNextToken = tokXmlDeclEnd; | ||
74 | iUsed = 2; | ||
75 | } | ||
76 | else | ||
77 | { | ||
78 | iNextToken = '?'; | ||
79 | } | ||
80 | break; | ||
81 | |||
82 | case ' ': | ||
83 | case '\t': | ||
84 | case '\n': | ||
85 | case '\r': | ||
86 | for( int j = 1;; j++ ) | ||
87 | { | ||
88 | if( j == sBuf.getSize() ) | ||
89 | { | ||
90 | if( rInput.isEOS() ) | ||
91 | error("Reached end of input while waiting for whitespace to end."); | ||
92 | |||
93 | fillBuffer(); | ||
94 | } | ||
95 | if( sBuf[j] == ' ' || sBuf[j] == '\t' || | ||
96 | sBuf[j] == '\n' || sBuf[j] == '\r' ) | ||
97 | iUsed++; | ||
98 | else | ||
99 | break; | ||
100 | } | ||
101 | sStr.clear(); | ||
102 | sStr.append( sBuf, iUsed ); | ||
103 | iNextToken = tokWS; | ||
104 | break; | ||
105 | |||
106 | case '=': | ||
107 | iNextToken = sBuf[0]; | ||
108 | break; | ||
109 | |||
110 | default: | ||
111 | if( (sBuf[0] >= 'a' && sBuf[0] <= 'z') || | ||
112 | (sBuf[0] >= 'A' && sBuf[0] <= 'Z') ) | ||
113 | { | ||
114 | for( int j = 1;; j++ ) | ||
115 | { | ||
116 | if( j == sBuf.getSize() ) | ||
117 | { | ||
118 | if( rInput.isEOS() ) | ||
119 | error("Reached end of input while waiting for a string to end."); | ||
120 | |||
121 | fillBuffer(); | ||
122 | } | ||
123 | if( (sBuf[j] >= 'a' && sBuf[j] <= 'z') || | ||
124 | (sBuf[j] >= 'A' && sBuf[j] <= 'Z') ) | ||
125 | iUsed++; | ||
126 | else | ||
127 | break; | ||
128 | } | ||
129 | sStr.clear(); | ||
130 | sStr.append( sBuf, iUsed ); | ||
131 | iNextToken = tokIdent; | ||
132 | } | ||
133 | } | ||
134 | |||
135 | cleanupBuffer( iUsed ); | ||
136 | |||
137 | return iCurToken; | ||
138 | } | ||
139 | |||
140 | void Bu::XmlReader::error( const char *sMessage ) | ||
141 | { | ||
142 | throw Bu::XmlException("%d:%d: %s", | ||
143 | spCurToken.iLine, spCurToken.iChar, sMessage ); | ||
144 | } | ||
145 | |||
146 | void Bu::XmlReader::stDocument() | ||
147 | { | ||
148 | stProlog(); | ||
149 | } | ||
150 | |||
151 | void Bu::XmlReader::stProlog() | ||
152 | { | ||
153 | stXmlDecl(); | ||
154 | } | ||
155 | |||
156 | void Bu::XmlReader::stXmlDecl() | ||
157 | { | ||
158 | if( nextToken() != tokXmlDeclHead ) | ||
159 | error("You must begin your xml file with a declaration: <?xml ... ?>"); | ||
160 | if( nextToken() != tokIdent ) | ||
161 | error("A version comes first!"); | ||
162 | if( sStr != "version" ) | ||
163 | error("No, a version!"); | ||
164 | } | ||
165 | |||
diff --git a/src/xmlreader.h b/src/xmlreader.h new file mode 100644 index 0000000..375dfe3 --- /dev/null +++ b/src/xmlreader.h | |||
@@ -0,0 +1,54 @@ | |||
1 | #ifndef BU_XML_READER_H | ||
2 | #define BU_XML_READER_H | ||
3 | |||
4 | #include "bu/fstring.h" | ||
5 | |||
6 | namespace Bu | ||
7 | { | ||
8 | class Stream; | ||
9 | |||
10 | class XmlReader | ||
11 | { | ||
12 | public: | ||
13 | XmlReader( Stream &rInput ); | ||
14 | virtual ~XmlReader(); | ||
15 | |||
16 | private: | ||
17 | Stream &rInput; | ||
18 | int iCurToken; | ||
19 | int iNextToken; | ||
20 | Bu::FString sBuf; | ||
21 | Bu::FString sStr; | ||
22 | bool bIgnoreWS; | ||
23 | typedef struct StreamPos | ||
24 | { | ||
25 | StreamPos() : iLine( 1 ), iChar( 1 ) { } | ||
26 | int iLine; | ||
27 | int iChar; | ||
28 | } StreamPos; | ||
29 | StreamPos spCurToken; | ||
30 | StreamPos spNextToken; | ||
31 | |||
32 | |||
33 | enum | ||
34 | { | ||
35 | tokXmlDeclHead = 0x100, | ||
36 | tokXmlDeclEnd, | ||
37 | tokWS, | ||
38 | tokIdent, | ||
39 | tokString | ||
40 | }; | ||
41 | |||
42 | void fillBuffer(); | ||
43 | void cleanupBuffer( int iUsed ); | ||
44 | int nextToken(); | ||
45 | |||
46 | void stDocument(); | ||
47 | void stProlog(); | ||
48 | void stXmlDecl(); | ||
49 | |||
50 | void error( const char *sMessage ); | ||
51 | }; | ||
52 | }; | ||
53 | |||
54 | #endif | ||