diff options
Diffstat (limited to '')
-rw-r--r-- | misc/w3c-xml-1.1.html | 1598 | ||||
-rw-r--r-- | src/tests/xml.cpp | 15 | ||||
-rw-r--r-- | src/tsfdocument.cpp | 9 | ||||
-rw-r--r-- | src/tsfdocument.h | 22 | ||||
-rw-r--r-- | src/tsfnode.cpp | 9 | ||||
-rw-r--r-- | src/tsfnode.h | 21 | ||||
-rw-r--r-- | src/tsfreader.cpp | 9 | ||||
-rw-r--r-- | src/tsfreader.h | 22 | ||||
-rw-r--r-- | src/tsfwriter.cpp | 9 | ||||
-rw-r--r-- | src/tsfwriter.h | 22 | ||||
-rw-r--r-- | src/xmldocument.cpp | 9 | ||||
-rw-r--r-- | src/xmldocument.h | 22 | ||||
-rw-r--r-- | src/xmlnode.cpp | 9 | ||||
-rw-r--r-- | src/xmlnode.h | 22 | ||||
-rw-r--r-- | src/xmlreader.cpp | 108 | ||||
-rw-r--r-- | src/xmlreader.h | 70 | ||||
-rw-r--r-- | src/xmlwriter.cpp | 9 | ||||
-rw-r--r-- | src/xmlwriter.h | 22 |
18 files changed, 2007 insertions, 0 deletions
diff --git a/misc/w3c-xml-1.1.html b/misc/w3c-xml-1.1.html new file mode 100644 index 0000000..6a9211a --- /dev/null +++ b/misc/w3c-xml-1.1.html | |||
@@ -0,0 +1,1598 @@ | |||
1 | <?xml version="1.0" encoding="UTF-8"?><!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html lang="EN" xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html;charset=UTF-8" /><title>Extensible Markup Language (XML) 1.1</title><style type="text/css"> | ||
2 | code { font-family: monospace; } | ||
3 | |||
4 | div.constraint, | ||
5 | div.issue, | ||
6 | div.note, | ||
7 | div.notice { margin-left: 2em; } | ||
8 | |||
9 | li p { margin-top: 0.3em; | ||
10 | margin-bottom: 0.3em; } | ||
11 | |||
12 | div.exampleInner pre { margin-left: 1em; | ||
13 | margin-top: 0em; margin-bottom: 0em} | ||
14 | div.exampleOuter {border: 4px double gray; | ||
15 | margin: 0em; padding: 0em} | ||
16 | div.exampleInner { background-color: #d5dee3; | ||
17 | border-top-width: 4px; | ||
18 | border-top-style: double; | ||
19 | border-top-color: #d3d3d3; | ||
20 | border-bottom-width: 4px; | ||
21 | border-bottom-style: double; | ||
22 | border-bottom-color: #d3d3d3; | ||
23 | padding: 4px; margin: 0em } | ||
24 | div.exampleWrapper { margin: 4px } | ||
25 | div.exampleHeader { font-weight: bold; | ||
26 | margin: 4px} | ||
27 | |||
28 | em.rfc2119 { text-transform: lowercase; | ||
29 | font-variant: small-caps; | ||
30 | font-style: normal; } | ||
31 | </style><link rel="stylesheet" type="text/css" href="http://www.w3.org/StyleSheets/TR/W3C-REC.css" /></head><body><div class="head"><p><a href="http://www.w3.org/"><img src="http://www.w3.org/Icons/w3c_home" alt="W3C" height="48" width="72" /></a></p> | ||
32 | <h1><a name="title" id="title" />Extensible Markup Language (XML) 1.1</h1> | ||
33 | <h2><a name="w3c-doctype" id="w3c-doctype" />W3C Recommendation 04 | ||
34 | February 2004, edited in place 15 April 2004</h2><dl><dt>This version:</dt><dd><a href="http://www.w3.org/TR/2004/REC-xml11-20040204/">http://www.w3.org/TR/2004/REC-xml11-20040204/</a></dd><dt>Latest version:</dt><dd><a href="http://www.w3.org/TR/xml11">http://www.w3.org/TR/xml11</a></dd><dt>Previous version:</dt><dd><a href="http://www.w3.org/TR/2003/PR-xml11-20031105/">http://www.w3.org/TR/2003/PR-xml11-20031105/</a></dd><dt>Editors:</dt><dd>Tim Bray, Textuality and Netscape <a href="mailto:tbray@textuality.com">&lt;tbray@textuality.com&gt;</a></dd><dd>Jean Paoli, Microsoft <a href="mailto:jeanpa@microsoft.com">&lt;jeanpa@microsoft.com&gt;</a></dd><dd>C. M. Sperberg-McQueen, W3C <a href="mailto:cmsmcq@w3.org">&lt;cmsmcq@w3.org&gt;</a></dd><dd>Eve Maler, Sun Microsystems, Inc. <a href="mailto:elm@east.sun.com">&lt;eve.maler@east.sun.com&gt;</a></dd><dd>François Yergeau <a href="mailto:fyergeau@alis.com">&lt;fyergeau@alis.com&gt;</a></dd><dd>John Cowan <a href="mailto:cowan@ccil.org">&lt;cowan@ccil.org&gt;</a></dd></dl><p>Please refer to the <a href="http://www.w3.org/XML/xml-V11-1e-errata"><strong>errata</strong></a> for this document, which may include some normative corrections.</p><p>This document is also available in these non-normative formats: <a href="REC-xml11-20040204.xml">XML</a> and <a href="REC-xml11-20040204-review.html">XHTML with color-coded revision indicators</a>.</p><p>See also <a href="http://www.w3.org/2003/03/Translations/byTechnology?technology=xml11"><strong>translations</strong></a>.</p><p class="copyright"><a href="http://www.w3.org/Consortium/Legal/ipr-notice#Copyright">Copyright</a> © 2004 <a href="http://www.w3.org/"><acronym title="World Wide Web Consortium">W3C</acronym></a><sup>®</sup> (<a href="http://www.csail.mit.edu/"><acronym title="Massachusetts Institute of Technology">MIT</acronym></a>, <a href="http://www.ercim.org/"><acronym title="European Research Consortium for Informatics and Mathematics">ERCIM</acronym></a>, <a 
href="http://www.keio.ac.jp/">Keio</a>), All Rights Reserved. W3C <a href="http://www.w3.org/Consortium/Legal/ipr-notice#Legal_Disclaimer">liability</a>, <a href="http://www.w3.org/Consortium/Legal/ipr-notice#W3C_Trademarks">trademark</a>, <a href="http://www.w3.org/Consortium/Legal/copyright-documents">document use</a> and <a href="http://www.w3.org/Consortium/Legal/copyright-software">software licensing</a> rules apply.</p></div><hr /><div> <h2><a name="abstract" id="abstract" />Abstract</h2><p>The Extensible Markup Language (XML) is a subset of SGML that is completely | ||
35 | described in this document. Its goal is to enable generic SGML to be served, | ||
36 | received, and processed on the Web in the way that is now possible with HTML. | ||
37 | XML has been designed for ease of implementation and for interoperability | ||
38 | with both SGML and HTML.</p></div><div> <h2><a name="status" id="status" />Status of this Document</h2><p><em>This section describes the status of this document at the time of its publication. Other documents may supersede this document. A list of current W3C publications and the latest revision of this technical report can be found in the <a href="http://www.w3.org/TR/">W3C technical reports index</a> at http://www.w3.org/TR/.</em></p><p>This document is a <a href="http://www.w3.org/2003/06/Process-20030618/tr.html#RecsW3C">Recommendation</a> of the W3C. | ||
39 | It has been reviewed by W3C Members and other interested parties, and has | ||
40 | been endorsed by the Director as a W3C Recommendation. It is a stable document and may be used as reference material or cited as a normative reference from another document. W3C's role in making the | ||
41 | Recommendation is to draw attention to the specification and to promote its widespread deployment. | ||
42 | This enhances the functionality and interoperability of the Web.</p><p>This document specifies a syntax created by subsetting an existing, widely | ||
43 | used international text processing standard (Standard Generalized Markup Language, | ||
44 | ISO 8879:1986(E) as amended and corrected) for use on the World Wide Web. | ||
45 | It is a product of the <a | ||
46 | href="http://www.w3.org/XML/Activity.html">W3C XML | ||
47 | Activity</a>.</p> | ||
48 | |||
49 | <p>On 15 April 2004, this document was edited in place to add two | ||
50 | missing spaces to <a | ||
51 | href="http://www.w3.org/TR/2004/REC-xml11-20040204/Overview.html#NT-document">production | ||
52 | [1]</a> in section 2.1</p> | ||
53 | |||
54 | <p>The English version of this specification is the only normative version. However, | ||
55 | for translations of this document, see <a href="http://www.w3.org/2003/03/Translations/byTechnology?technology=xml11">http://www.w3.org/2003/03/Translations/byTechnology?technology=xml11</a>. | ||
56 | </p><p>Documentation of intellectual property possibly relevant to this recommendation | ||
57 | may be found at the Working Group's public | ||
58 | <a href="http://www.w3.org/2002/08/xmlcore-IPR-statements">IPR disclosure page</a>.</p><p>An implementation report for XML 1.1 is available at <a href="http://www.w3.org/XML/2002/09/xml11-implementation.html">http://www.w3.org/XML/2002/09/xml11-implementation.html</a>.</p><p>Please report errors in this document to <a href="mailto:xml-editor@w3.org">xml-editor@w3.org</a>; <a href="http://lists.w3.org/Archives/Public/xml-editor">archives</a> are available. The errata list for this edition is available | ||
59 | at <a href="http://www.w3.org/XML/xml-V11-1e-errata">http://www.w3.org/XML/xml-V11-1e-errata</a>.</p><p>A <a href="http://www.w3.org/XML/Test/">Test Suite</a> is maintained to help assessing conformance to this specification.</p></div><div class="toc"> <h2><a name="contents" id="contents" />Table of Contents</h2><p class="toc">1 <a href="#sec-intro">Introduction</a><br /> 1.1 <a href="#sec-origin-goals">Origin and Goals</a><br /> 1.2 <a href="#sec-terminology">Terminology</a><br /> 1.3 <a href="#sec-xml11">Rationale and list of changes for XML 1.1</a><br /> 2 <a href="#sec-documents">Documents</a><br /> 2.1 <a href="#sec-well-formed">Well-Formed XML Documents</a><br /> 2.2 <a href="#charsets">Characters</a><br /> 2.3 <a href="#sec-common-syn">Common Syntactic Constructs</a><br /> 2.4 <a href="#syntax">Character Data and Markup</a><br /> 2.5 <a href="#sec-comments">Comments</a><br /> 2.6 <a href="#sec-pi">Processing Instructions</a><br /> 2.7 <a href="#sec-cdata-sect">CDATA Sections</a><br /> 2.8 <a href="#sec-prolog-dtd">Prolog and Document Type Declaration</a><br /> 2.9 <a href="#sec-rmd">Standalone Document Declaration</a><br /> 2.10 <a href="#sec-white-space">White Space Handling</a><br /> 2.11 <a href="#sec-line-ends">End-of-Line Handling</a><br /> 2.12 <a href="#sec-lang-tag">Language Identification</a><br /> 2.13 <a href="#sec-normalization-checking">Normalization Checking</a><br /> 3 <a href="#sec-logical-struct">Logical Structures</a><br /> 3.1 <a href="#sec-starttags">Start-Tags, End-Tags, and Empty-Element Tags</a><br /> 3.2 <a href="#elemdecls">Element Type Declarations</a><br /> 3.2.1 <a href="#sec-element-content">Element Content</a><br /> 3.2.2 <a href="#sec-mixed-content">Mixed Content</a><br /> 3.3 <a href="#attdecls">Attribute-List Declarations</a><br /> 3.3.1 <a href="#sec-attribute-types">Attribute Types</a><br /> 3.3.2 <a href="#sec-attr-defaults">Attribute Defaults</a><br /> 3.3.3 <a href="#AVNormalize">Attribute-Value Normalization</a><br 
/> 3.4 <a href="#sec-condition-sect">Conditional Sections</a><br /> 4 <a href="#sec-physical-struct">Physical Structures</a><br /> 4.1 <a href="#sec-references">Character and Entity References</a><br /> 4.2 <a href="#sec-entity-decl">Entity Declarations</a><br /> 4.2.1 <a href="#sec-internal-ent">Internal Entities</a><br /> 4.2.2 <a href="#sec-external-ent">External Entities</a><br /> 4.3 <a href="#TextEntities">Parsed Entities</a><br /> 4.3.1 <a href="#sec-TextDecl">The Text Declaration</a><br /> 4.3.2 <a href="#wf-entities">Well-Formed Parsed Entities</a><br /> 4.3.3 <a href="#charencoding">Character Encoding in Entities</a><br /> 4.3.4 <a href="#sec-version-info">Version Information in Entities</a><br /> 4.4 <a href="#entproc">XML Processor Treatment of Entities and References</a><br /> 4.4.1 <a href="#not-recognized">Not Recognized</a><br /> 4.4.2 <a href="#included">Included</a><br /> 4.4.3 <a href="#include-if-valid">Included If Validating</a><br /> 4.4.4 <a href="#forbidden">Forbidden</a><br /> 4.4.5 <a href="#inliteral">Included in Literal</a><br /> 4.4.6 <a href="#notify">Notify</a><br /> 4.4.7 <a href="#bypass">Bypassed</a><br /> 4.4.8 <a href="#as-PE">Included as PE</a><br /> 4.4.9 <a href="#error">Error</a><br /> 4.5 <a href="#intern-replacement">Construction of Entity Replacement Text</a><br /> 4.6 <a href="#sec-predefined-ent">Predefined Entities</a><br /> 4.7 <a href="#Notations">Notation Declarations</a><br /> 4.8 <a href="#sec-doc-entity">Document Entity</a><br /> 5 <a href="#sec-conformance">Conformance</a><br /> 5.1 <a href="#proc-types">Validating and Non-Validating Processors</a><br /> 5.2 <a href="#safe-behavior">Using XML Processors</a><br /> 6 <a href="#sec-notation">Notation</a><br /> </p> <h3><a name="appendices" id="appendices" />Appendices</h3><p class="toc">A <a href="#sec-bibliography">References</a><br /> A.1 <a href="#sec-existing-stds">Normative References</a><br /> A.2 <a href="#null">Other References</a><br /> B <a 
href="#sec-CharNorm">Definitions for Character Normalization</a><br /> C <a href="#sec-entexpand">Expansion of Entity and Character References</a> (Non-Normative)<br /> D <a href="#determinism">Deterministic Content Models</a> (Non-Normative)<br /> E <a href="#sec-guessing">Autodetection of Character Encodings</a> (Non-Normative)<br /> E.1 <a href="#sec-guessing-no-ext-info">Detection Without External Encoding Information</a><br /> E.2 <a href="#sec-guessing-with-ext-info">Priorities in the Presence of External Encoding Information</a><br /> F <a href="#sec-xml-wg">W3C XML Working Group</a> (Non-Normative)<br /> G <a href="#sec-core-wg">W3C XML Core Working Group</a> (Non-Normative)<br /> H <a href="#prod-notes">Production Notes</a> (Non-Normative)<br /> I <a href="#sec-suggested-names">Suggestions for XML Names</a> (Non-Normative)<br /> </p></div><hr /><div class="body"><div class="div1"> <h2><a name="sec-intro" id="sec-intro" />1 Introduction</h2><p>Extensible Markup Language, abbreviated XML, describes a class of data | ||
60 | objects called <a title="XML Document" href="#dt-xml-doc">XML documents</a> and partially | ||
61 | describes the behavior of computer programs which process them. XML is an | ||
62 | application profile or restricted form of SGML, the Standard Generalized Markup | ||
63 | Language <a href="#ISO8879">[ISO 8879]</a>. By construction, XML documents are conforming | ||
64 | SGML documents.</p><p>XML documents are made up of storage units called <a title="Entity" href="#dt-entity">entities</a>, | ||
65 | which contain either parsed or unparsed data. Parsed data is made up of <a title="Character" href="#dt-character">characters</a>, some of which form <a title="Character Data" href="#dt-chardata">character | ||
66 | data</a>, and some of which form <a title="Markup" href="#dt-markup">markup</a>. | ||
67 | Markup encodes a description of the document's storage layout and logical | ||
68 | structure. XML provides a mechanism to impose constraints on the storage layout | ||
69 | and logical structure.</p><p>[<a name="dt-xml-proc" id="dt-xml-proc" title="XML Processor">Definition</a>: A software module called | ||
70 | an <b>XML processor</b> is used to read XML documents and provide access | ||
71 | to their content and structure.] [<a name="dt-app" id="dt-app" title="Application">Definition</a>: It | ||
72 | is assumed that an XML processor is doing its work on behalf of another module, | ||
73 | called the <b>application</b>.] This specification describes | ||
74 | the required behavior of an XML processor in terms of how it must read XML | ||
75 | data and the information it must provide to the application.</p><div class="div2"> <h3><a name="sec-origin-goals" id="sec-origin-goals" />1.1 Origin and Goals</h3><p>XML was developed by an XML Working Group (originally known as the SGML | ||
76 | Editorial Review Board) formed under the auspices of the World Wide Web Consortium | ||
77 | (W3C) in 1996. It was chaired by Jon Bosak of Sun Microsystems with the active | ||
78 | participation of an XML Special Interest Group (previously known as the SGML | ||
79 | Working Group) also organized by the W3C. The membership of the XML Working | ||
80 | Group is given in an appendix. Dan Connolly served as the Working Group's contact with | ||
81 | the W3C.</p><p>The design goals for XML are:</p><ol type="1"><li><p>XML shall be straightforwardly usable over the Internet.</p></li><li><p>XML shall support a wide variety of applications.</p></li><li><p>XML shall be compatible with SGML.</p></li><li><p>It shall be easy to write programs which process XML documents.</p></li><li><p>The number of optional features in XML is to be kept to the absolute | ||
82 | minimum, ideally zero.</p></li><li><p>XML documents should be human-legible and reasonably clear.</p></li><li><p>The XML design should be prepared quickly.</p></li><li><p>The design of XML shall be formal and concise.</p></li><li><p>XML documents shall be easy to create.</p></li><li><p>Terseness in XML markup is of minimal importance.</p></li></ol><p>This specification, together with associated standards (Unicode | ||
83 | <a href="#Unicode">[Unicode]</a> and ISO/IEC 10646 <a href="#ISO10646">[ISO/IEC 10646]</a> | ||
84 | for characters, Internet RFC 3066 <a href="#RFC1766">[IETF RFC 3066]</a> for | ||
85 | language identification tags, ISO 639 <a href="#ISO639">[ISO 639]</a> | ||
86 | for language name codes, and ISO 3166 <a href="#ISO3166">[ISO 3166]</a> for | ||
87 | country name codes), provides all the information necessary to | ||
88 | understand XML Version 1.1 and construct computer | ||
89 | programs to process it.</p><p>This version of the XML specification may be distributed freely, as long as | ||
90 | all text and legal notices remain intact.</p></div><div class="div2"> <h3><a name="sec-terminology" id="sec-terminology" />1.2 Terminology</h3><p>The terminology used to describe XML documents is defined in the body of | ||
91 | this specification. <span class="mustard">The key words <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em>, <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em>, <em class="rfc2119" title="Keyword in RFC 2119 context">REQUIRED</em>, <em class="rfc2119" title="Keyword in RFC 2119 context">SHALL</em>, <em class="rfc2119" title="Keyword in RFC 2119 context">SHALL NOT</em>, <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em>, <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD NOT</em>, <em class="rfc2119" title="Keyword in RFC 2119 context">RECOMMENDED</em>, <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em>, and <em class="rfc2119" title="Keyword in RFC 2119 context">OPTIONAL</em>, when <em class="rfc2119" title="Keyword in RFC 2119 context">EMPHASIZED</em>, are to be interpreted as described in <a href="#rfc2119">[IETF RFC 2119]</a>. In addition, </span>the terms defined in the following list are used in building | ||
92 | those definitions and in describing the actions of an XML processor:</p><dl><dt class="label">error</dt><dd><p>[<a name="dt-error" id="dt-error" title="Error">Definition</a>: A violation of the rules of this specification; | ||
93 | results are undefined. <span class="mustard">Unless otherwise specified, failure to observe a prescription of this specification indicated by one of the keywords <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em>, <em class="rfc2119" title="Keyword in RFC 2119 context">REQUIRED</em>, <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em>, <em class="rfc2119" title="Keyword in RFC 2119 context">SHALL</em> and <em class="rfc2119" title="Keyword in RFC 2119 context">SHALL NOT</em> is an error.</span> Conforming software <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> detect and report an error | ||
94 | and <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> recover from it.]</p></dd><dt class="label">fatal error</dt><dd><p>[<a name="dt-fatal" id="dt-fatal" title="Fatal Error">Definition</a>: An error which a conforming <a title="XML Processor" href="#dt-xml-proc">XML processor</a> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> detect and report to the application. | ||
95 | After encountering a fatal error, the processor <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> continue processing the | ||
96 | data to search for further errors and <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> report such errors to the application. | ||
97 | In order to support correction of errors, the processor <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> make unprocessed | ||
98 | data from the document (with intermingled character data and markup) available | ||
99 | to the application. Once a fatal error is detected, however, the processor | ||
100 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> continue normal processing (i.e., it <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> continue to pass character | ||
101 | data and information about the document's logical structure to the application | ||
102 | in the normal way).]</p></dd><dt class="label">at user option</dt><dd><p>[<a name="dt-atuseroption" id="dt-atuseroption" title="At user option">Definition</a>: Conforming software | ||
103 | <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> or <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> (depending on the modal verb in the sentence) behave as described; | ||
104 | if it does, it <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> provide users a means to enable or disable the behavior | ||
105 | described.]</p></dd><dt class="label">validity constraint</dt><dd><p>[<a name="dt-vc" id="dt-vc" title="Validity constraint">Definition</a>: A rule which applies to | ||
106 | all <a title="Validity" href="#dt-valid">valid</a> XML documents. Violations of validity | ||
107 | constraints are errors; they <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em>, at user option, be reported by <a title="Validating Processor" href="#dt-validating">validating XML processors</a>.]</p></dd><dt class="label">well-formedness constraint</dt><dd><p>[<a name="dt-wfc" id="dt-wfc" title="Well-formedness constraint">Definition</a>: A rule which applies | ||
108 | to all <a title="Well-Formed" href="#dt-wellformed">well-formed</a> XML documents. Violations | ||
109 | of well-formedness constraints are <a title="Fatal Error" href="#dt-fatal">fatal errors</a>.]</p></dd><dt class="label">match</dt><dd><p>[<a name="dt-match" id="dt-match" title="match">Definition</a>: (Of strings or names:) Two strings | ||
110 | or names being compared <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be identical. Characters with multiple possible | ||
111 | representations in Unicode (e.g. characters with both precomposed and | ||
112 | base+diacritic forms) match only if they have the same representation in both | ||
113 | strings. No | ||
114 | case folding is performed. (Of strings and rules in the grammar:) A string | ||
115 | matches a grammatical production if it belongs to the language generated by | ||
116 | that production. (Of content and content models:) An element matches its declaration | ||
117 | when it conforms in the fashion described in the constraint <b>[VC: <a href="#elementvalid">Element Valid</a>]</b>.]</p></dd><dt class="label">for compatibility</dt><dd><p>[<a name="dt-compat" id="dt-compat" title="For Compatibility">Definition</a>: Marks | ||
118 | a sentence describing a feature of XML included solely to ensure | ||
119 | that XML remains compatible with SGML.]</p></dd><dt class="label">for interoperability</dt><dd><p>[<a name="dt-interop" id="dt-interop" title="For interoperability">Definition</a>: Marks | ||
120 | a sentence describing a non-binding recommendation included to increase | ||
121 | the chances that XML documents can be processed by the existing installed | ||
122 | base of SGML processors which predate the WebSGML Adaptations Annex to ISO 8879.]</p></dd></dl><p></p></div><div class="div2"> <h3><a name="sec-xml11" id="sec-xml11" />1.3 Rationale and list of changes for XML 1.1</h3><p>The W3C's XML 1.0 Recommendation was first issued in 1998, and | ||
123 | despite the issuance of many errata culminating in a Third Edition | ||
124 | of 2004, has remained (by intention) unchanged with respect to what | ||
125 | is well-formed XML and what is not. This stability has been | ||
126 | extremely useful for interoperability. However, the Unicode | ||
127 | Standard on which XML 1.0 relies for character specifications has | ||
128 | not remained static, evolving from version 2.0 to version 4.0 and | ||
129 | beyond. Characters not present in Unicode 2.0 may already be used | ||
130 | in XML 1.0 character data. However, they are not allowed in XML | ||
131 | names such as element type names, attribute names, enumerated | ||
132 | attribute values, processing instruction targets, and so on. In | ||
133 | addition, some characters that should have been permitted in XML | ||
134 | names were not, due to oversights and inconsistencies in Unicode | ||
135 | 2.0.</p><p>The overall philosophy of names has changed since XML 1.0. | ||
136 | Whereas XML 1.0 provided a rigid definition of names, wherein | ||
137 | everything that was not permitted was forbidden, XML 1.1 names are | ||
138 | designed so that everything that is not forbidden (for a specific | ||
139 | reason) is permitted. Since Unicode will continue to grow past | ||
140 | version 4.0, further changes to XML can be avoided by allowing | ||
141 | almost any character, including those not yet assigned, in | ||
142 | names.</p><p>In addition, XML 1.0 attempts to adapt to the line-end | ||
143 | conventions of various modern operating systems, but discriminates | ||
144 | against the conventions used on IBM and IBM-compatible mainframes. | ||
145 | As a result, XML documents on mainframes are not plain text files | ||
146 | according to the local conventions. XML 1.0 documents generated on | ||
147 | mainframes must either violate the local line-end conventions, or | ||
148 | employ otherwise unnecessary translation phases before parsing and | ||
149 | after generation. Allowing straightforward interoperability is | ||
150 | particularly important when data stores are shared between | ||
151 | mainframe and non-mainframe systems (as opposed to being copied | ||
152 | from one to the other). Therefore XML 1.1 adds NEL (#x85) to the | ||
153 | list of line-end characters. For completeness, the Unicode line | ||
154 | separator character, #x2028, is also supported. | ||
155 | </p><p>Finally, there is considerable demand to define a standard representation | ||
156 | of arbitrary Unicode characters in XML documents. Therefore, XML 1.1 | ||
157 | allows the use of character references to the control characters #x1 through | ||
158 | #x1F, most of which are forbidden in XML 1.0. For reasons of robustness, | ||
159 | however, these characters still cannot be used directly in documents. In | ||
160 | order to improve the robustness of character encoding detection, the additional | ||
161 | control characters #x7F through #x9F, which were freely allowed in XML 1.0 | ||
162 | documents, now must also appear only as character references. (Whitespace | ||
163 | characters are of course exempt.) The minor sacrifice of backward compatibility | ||
164 | is considered not significant. Due to potential problems with APIs, | ||
165 | #x0 is still forbidden both directly and as a character reference. | ||
166 | </p><p>Finally, XML 1.1 defines a set of constraints called "full | ||
167 | normalization" on XML documents, which document creators | ||
168 | <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> adhere to, and document processors | ||
169 | <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> verify. Using fully normalized documents | ||
170 | ensures that identity comparisons of names, attribute values, and | ||
171 | character content can be made correctly by simple binary comparison of | ||
172 | Unicode strings.</p><p>A new XML version, rather than a set of errata to XML 1.0, is | ||
173 | being created because the changes affect the definition of | ||
174 | well-formed documents. XML 1.0 processors must continue to reject | ||
175 | documents that contain new characters in XML names, new line-end | ||
176 | conventions, and references to control characters. The distinction between XML 1.0 and XML 1.1 documents | ||
177 | is indicated by the version number information in the XML | ||
178 | declaration at the start of each document. | ||
179 | </p></div></div><div class="div1"> <h2><a name="sec-documents" id="sec-documents" />2 Documents</h2><p>[<a name="dt-xml-doc" id="dt-xml-doc" title="XML Document">Definition</a>: A data object is an <b>XML | ||
180 | document</b> if it is <a title="Well-Formed" href="#dt-wellformed">well-formed</a>, | ||
181 | as defined in this specification. A well-formed XML document <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> in addition | ||
182 | be <a title="Validity" href="#dt-valid">valid</a> if it meets certain further constraints.]</p><p>Each XML document has both a logical and a physical structure. Physically, | ||
183 | the document is composed of units called <a title="Entity" href="#dt-entity">entities</a>. | ||
184 | An entity <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> <a title="Entity Reference" href="#dt-entref">refer</a> to other entities to | ||
185 | cause their inclusion in the document. A document begins in a "root" | ||
186 | or <a title="Document Entity" href="#dt-docent">document entity</a>. Logically, the document | ||
187 | is composed of declarations, elements, comments, character references, and | ||
188 | processing instructions, all of which are indicated in the document by explicit | ||
189 | markup. The logical and physical structures <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> nest properly, as described | ||
190 | in <a href="#wf-entities"><b>4.3.2 Well-Formed Parsed Entities</b></a>.</p><div class="div2"> <h3><a name="sec-well-formed" id="sec-well-formed" />2.1 Well-Formed XML Documents</h3><p>[<a name="dt-wellformed" id="dt-wellformed" title="Well-Formed">Definition</a>: A textual object is a <b>well-formed</b> | ||
191 | XML document if:]</p><ol type="1"><li><p>Taken as a whole, it matches the production labeled <a href="#NT-document">document</a>.</p></li><li><p>It meets all the well-formedness constraints given in this specification.</p></li><li><p>Each of the <a title="Text Entity" href="#dt-parsedent">parsed entities</a> | ||
192 | which is referenced directly or indirectly within the document is <a | ||
193 | title="Well-Formed" | ||
194 | href="#dt-wellformed">well-formed</a>.</p></li></ol> <h5><a | ||
195 | name="document" id="document" />Document</h5><table class="scrap" | ||
196 | summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-document" | ||
197 | id="NT-document" | ||
198 | />[1] </td><td><code>document</code></td><td> ::= </td><td><code><a | ||
199 | href="#NT-prolog">prolog</a> <a href="#NT-element">element</a> <a | ||
200 | href="#NT-Misc">Misc</a>* - <a href="#NT-Char">Char</a>* <a | ||
201 | href="#NT-RestrictedChar">RestrictedChar</a> <a href="#NT-Char">Char</a>*</code></td></tr></tbody></table><p>Matching the <a href="#NT-document">document</a> production implies that:</p><ol type="1"><li><p>It contains one or more <a title="Element" href="#dt-element">elements</a>.</p></li><li><p>[<a name="dt-root" id="dt-root" title="Root Element">Definition</a>: There is exactly one element, | ||
202 | called the <b>root</b>, or document element, no part of which appears | ||
203 | in the <a title="Content" href="#dt-content">content</a> of any other element.] For | ||
204 | all other elements, if the <a title="Start-Tag" href="#dt-stag">start-tag</a> is in | ||
205 | the content of another element, the <a title="End Tag" href="#dt-etag">end-tag</a> | ||
206 | is in the content of the same element. More simply stated, the elements, | ||
207 | delimited by start- and end-tags, nest properly within each other.</p></li></ol><p>[<a name="dt-parentchild" id="dt-parentchild" title="Parent/Child">Definition</a>: As a consequence of this, | ||
208 | for each non-root element <code>C</code> in the document, there is one other element <code>P</code> | ||
209 | in the document such that <code>C</code> is in the content of <code>P</code>, but | ||
210 | is not in the content of any other element that is in the content of <code>P</code>. <code>P</code> | ||
211 | is referred to as the <b>parent</b> of <code>C</code>, and <code>C</code> as | ||
212 | a <b>child</b> of <code>P</code>.]</p></div><div class="div2"> <h3><a name="charsets" id="charsets" />2.2 Characters</h3><p>[<a name="dt-text" id="dt-text" title="Text">Definition</a>: A parsed entity contains <b>text</b>, | ||
213 | a sequence of <a title="Character" href="#dt-character">characters</a>, which may | ||
214 | represent markup or character data.] [<a name="dt-character" id="dt-character" title="Character">Definition</a>: A <b>character</b> | ||
215 | is an atomic unit of text as specified by <span>ISO/IEC 10646 <a href="#ISO10646">[ISO/IEC 10646]</a></span>. Legal characters are tab, carriage | ||
216 | return, line feed, and the legal characters | ||
217 | of Unicode and ISO/IEC 10646. The | ||
218 | versions of these standards cited in <a href="#sec-existing-stds"><b>A.1 Normative References</b></a> were | ||
219 | current at the time this document was prepared. New characters may be added | ||
220 | to these standards by amendments or new editions. Consequently, XML processors | ||
221 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> accept any character in the range specified for <a href="#NT-Char">Char</a>.]</p> <h5><a name="char32" id="char32" />Character Range</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-Char" id="NT-Char" />[2] </td><td><code>Char</code></td><td> ::= </td><td><code>[#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]</code></td><td><i>/* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. */</i></td></tr><tr valign="baseline"><td><a name="NT-RestrictedChar" id="NT-RestrictedChar" />[2a] </td><td><code>RestrictedChar</code></td><td> ::= </td><td><code>[#x1-#x8] | [#xB-#xC] | [#xE-#x1F] | [#x7F-#x84] | [#x86-#x9F]</code></td></tr></tbody></table><p>The mechanism for encoding character code points into bit patterns <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> | ||
222 | vary from entity to entity. All XML processors <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> accept the UTF-8 and UTF-16 | ||
223 | encodings of <span> Unicode | ||
224 | <a href="#Unicode">[Unicode]</a></span>; | ||
225 | the mechanisms for signaling which of the two is in use, | ||
226 | or for bringing other encodings into play, are discussed later, in <a href="#charencoding"><b>4.3.3 Character Encoding in Entities</b></a>.</p><div class="note"><p class="prefix"><b>Note:</b></p><p>Document authors are encouraged to avoid | ||
227 | "compatibility characters", as defined | ||
228 | in Unicode <a href="#Unicode">[Unicode]</a>. | ||
229 | The characters defined in the following ranges are also | ||
230 | discouraged. They are either control characters or permanently undefined Unicode | ||
231 | characters:</p><div class="exampleInner"><pre> | ||
232 | [#x7F-#x84], [#x86-#x9F], [#xFDD0-#xFDDF], | ||
233 | [#x1FFFE-#x1FFFF], [#x2FFFE-#x2FFFF], [#x3FFFE-#x3FFFF], | ||
234 | [#x4FFFE-#x4FFFF], [#x5FFFE-#x5FFFF], [#x6FFFE-#x6FFFF], | ||
235 | [#x7FFFE-#x7FFFF], [#x8FFFE-#x8FFFF], [#x9FFFE-#x9FFFF], | ||
236 | [#xAFFFE-#xAFFFF], [#xBFFFE-#xBFFFF], [#xCFFFE-#xCFFFF], | ||
237 | [#xDFFFE-#xDFFFF], [#xEFFFE-#xEFFFF], [#xFFFFE-#xFFFFF], | ||
238 | [#x10FFFE-#x10FFFF].</pre></div></div></div><div class="div2"> <h3><a name="sec-common-syn" id="sec-common-syn" />2.3 Common Syntactic Constructs</h3><p>This section defines some symbols used widely in the grammar.</p><p><a href="#NT-S">S</a> (white space) consists of one or more space (#x20) | ||
239 | characters, carriage returns, line feeds, or tabs.</p> <h5><a name="white" id="white" />White Space</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-S" id="NT-S" />[3] </td><td><code>S</code></td><td> ::= </td><td><code>(#x20 | #x9 | #xD | #xA)+</code></td></tr></tbody></table><div class="note"><p class="prefix"><b>Note:</b></p><p>The presence of #xD in the above production is | ||
240 | maintained purely for backward compatibility with the | ||
241 | <a href="http://www.w3.org/TR/1998/REC-xml-19980210">First Edition</a>. | ||
242 | As explained in <a href="#sec-line-ends"><b>2.11 End-of-Line Handling</b></a>, | ||
243 | all #xD characters literally present in an XML document | ||
244 | are either removed or replaced by #xA characters before | ||
245 | any other processing is done. The only way to get a #xD character to match this production is to | ||
246 | use a character reference in an entity value literal.</p></div><p>[<a name="dt-name" id="dt-name" title="Name">Definition</a>: A <b>Name</b> is a token beginning | ||
247 | with a letter or one of a few punctuation characters, and continuing with | ||
248 | letters, digits, hyphens, underscores, colons, or full stops, together known | ||
249 | as name characters.] Names beginning with the string "<code>xml</code>", | ||
250 | or <span>with</span> any string which would match <code>(('X'|'x') ('M'|'m') ('L'|'l'))</code>, | ||
251 | are reserved for standardization in this or future versions of this specification.</p><div class="note"><p class="prefix"><b>Note:</b></p><p>The | ||
252 | Namespaces in XML Recommendation <a href="#xml-names">[XML Names]</a> assigns a meaning | ||
253 | to names containing colon characters. Therefore, authors should not use the | ||
254 | colon in XML names except for namespace purposes, but XML processors must | ||
255 | accept the colon as a name character.</p></div><p>An <a href="#NT-Nmtoken">Nmtoken</a> (name token) is any mixture of name | ||
256 | characters.</p><p>The first character of a Name <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be a NameStartChar, and any | ||
257 | other characters <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be NameChars; this mechanism is used to | ||
258 | prevent names from beginning with European (ASCII) digits or with | ||
259 | basic combining characters. Almost all characters are permitted in | ||
260 | names, except those which either are or reasonably could be used as | ||
261 | delimiters. The intention is to be inclusive rather than exclusive, | ||
262 | so that writing systems not yet encoded in Unicode can be used in | ||
263 | XML names. See <a href="#sec-suggested-names"><b>I Suggestions for XML Names</b></a> for suggestions on the creation of | ||
264 | names.</p><p>Document authors are encouraged to use names which are | ||
265 | meaningful words or combinations of words in natural languages, and | ||
266 | to avoid symbolic or white space characters in names. Note that | ||
267 | COLON, HYPHEN-MINUS, FULL STOP (period), LOW LINE (underscore), and | ||
268 | MIDDLE DOT are explicitly permitted.</p><p>The ASCII symbols and punctuation marks, along with a fairly | ||
269 | large group of Unicode symbol characters, are excluded from names | ||
270 | because they are more useful as delimiters in contexts where XML | ||
271 | names are used outside XML documents; providing this group gives | ||
272 | those contexts hard guarantees about what <em>cannot</em> be part of | ||
273 | an XML name. The character #x037E, GREEK QUESTION MARK, is excluded | ||
274 | because when normalized it becomes a semicolon, which could change | ||
275 | the meaning of entity references.</p> <h5><a name="IDABN1S" id="IDABN1S" />Names and Tokens</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-NameStartChar" id="NT-NameStartChar" />[4] </td><td><code>NameStartChar</code></td><td> ::= </td><td><code>":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]</code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-NameChar" id="NT-NameChar" />[4a] </td><td><code>NameChar</code></td><td> ::= </td><td><code><a href="#NT-NameStartChar">NameStartChar</a> | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]</code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-Name" id="NT-Name" />[5] </td><td><code>Name</code></td><td> ::= </td><td><code><a href="#NT-NameStartChar">NameStartChar</a> (<a href="#NT-NameChar">NameChar</a>)*</code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-Names" id="NT-Names" />[6] </td><td><code>Names</code></td><td> ::= </td><td><code><a href="#NT-Name">Name</a> (#x20 <a href="#NT-Name">Name</a>)*</code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-Nmtoken" id="NT-Nmtoken" />[7] </td><td><code>Nmtoken</code></td><td> ::= </td><td><code>(<a href="#NT-NameChar">NameChar</a>)+</code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-Nmtokens" id="NT-Nmtokens" />[8] </td><td><code>Nmtokens</code></td><td> ::= </td><td><code><a href="#NT-Nmtoken">Nmtoken</a> (#x20 <a href="#NT-Nmtoken">Nmtoken</a>)*</code></td></tr></tbody></table><div class="note"><p class="prefix"><b>Note:</b></p><p>The <a href="#NT-Names">Names</a> | ||
276 | and <a href="#NT-Nmtokens">Nmtokens</a> productions are used to define the validity | ||
277 | of tokenized attribute values after normalization (see <a href="#sec-attribute-types"><b>3.3.1 Attribute Types</b></a>).</p></div><p>Literal data is any quoted string not containing the quotation mark used | ||
278 | as a delimiter for that string. Literals are used for specifying the content | ||
279 | of internal entities (<a href="#NT-EntityValue">EntityValue</a>), the values | ||
280 | of attributes (<a href="#NT-AttValue">AttValue</a>), and external identifiers | ||
281 | (<a href="#NT-SystemLiteral">SystemLiteral</a>). Note that a <a href="#NT-SystemLiteral">SystemLiteral</a> | ||
282 | can be parsed without scanning for markup.</p> <h5><a name="IDAFR1S" id="IDAFR1S" />Literals</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-EntityValue" id="NT-EntityValue" />[9] </td><td><code>EntityValue</code></td><td> ::= </td><td><code>'"' ([^%&"] | <a href="#NT-PEReference">PEReference</a> | ||
283 | | <a href="#NT-Reference">Reference</a>)* '"' </code></td></tr><tr valign="baseline"><td /><td /><td /><td><code>| "'" ([^%&'] | <a href="#NT-PEReference">PEReference</a> | <a href="#NT-Reference">Reference</a>)* "'"</code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-AttValue" id="NT-AttValue" />[10] </td><td><code>AttValue</code></td><td> ::= </td><td><code>'"' ([^<&"] | <a href="#NT-Reference">Reference</a>)* | ||
284 | '"' </code></td></tr><tr valign="baseline"><td /><td /><td /><td><code>| "'" ([^<&'] | <a href="#NT-Reference">Reference</a>)* | ||
285 | "'"</code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-SystemLiteral" id="NT-SystemLiteral" />[11] </td><td><code>SystemLiteral</code></td><td> ::= </td><td><code>('"' [^"]* '"') | ("'" [^']* "'") </code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-PubidLiteral" id="NT-PubidLiteral" />[12] </td><td><code>PubidLiteral</code></td><td> ::= </td><td><code>'"' <a href="#NT-PubidChar">PubidChar</a>* '"' | ||
286 | | "'" (<a href="#NT-PubidChar">PubidChar</a> - "'")* "'"</code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-PubidChar" id="NT-PubidChar" />[13] </td><td><code>PubidChar</code></td><td> ::= </td><td><code>#x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%]</code></td></tr></tbody></table><div class="note"><p class="prefix"><b>Note:</b></p><p>Although | ||
287 | the <a href="#NT-EntityValue">EntityValue</a> production allows the definition | ||
288 | of a general entity consisting of a single explicit <code><</code> in the literal | ||
289 | (e.g., <code><!ENTITY mylt "<"></code>), it is strongly advised to avoid | ||
290 | this practice since any reference to that entity will cause a well-formedness | ||
291 | error.</p></div></div><div class="div2"> <h3><a name="syntax" id="syntax" />2.4 Character Data and Markup</h3><p><a title="Text" href="#dt-text">Text</a> consists of intermingled <a title="Character Data" href="#dt-chardata">character data</a> and markup. [<a name="dt-markup" id="dt-markup" title="Markup">Definition</a>: <b>Markup</b> takes the form of <a title="Start-Tag" href="#dt-stag">start-tags</a>, <a title="End Tag" href="#dt-etag">end-tags</a>, <a title="Empty" href="#dt-empty">empty-element tags</a>, <a title="Entity Reference" href="#dt-entref">entity references</a>, <a title="Character Reference" href="#dt-charref">character | ||
292 | references</a>, <a title="Comment" href="#dt-comment">comments</a>, <a title="CDATA Section" href="#dt-cdsection">CDATA section</a> delimiters, <a title="Document Type Declaration" href="#dt-doctype">document | ||
293 | type declarations</a>, <a title="Processing instruction" href="#dt-pi">processing instructions</a>, <a href="#NT-XMLDecl">XML declarations</a>, <a href="#NT-TextDecl">text declarations</a>, | ||
294 | and any white space that is at the top level of the document entity (that | ||
295 | is, outside the document element and not inside any other markup).]</p><p>[<a name="dt-chardata" id="dt-chardata" title="Character Data">Definition</a>: All text that is not markup | ||
296 | constitutes the <b>character data</b> of the document.]</p><p>The ampersand character (&) and the left angle bracket (<) <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em></span> appear | ||
297 | in their literal form<span class="mustard">, except</span> when used as markup delimiters, or | ||
298 | within a <a title="Comment" href="#dt-comment">comment</a>, a <a title="Processing instruction" href="#dt-pi">processing | ||
299 | instruction</a>, or a <a title="CDATA Section" href="#dt-cdsection">CDATA section</a>. | ||
300 | |||
301 | If they are needed elsewhere, they <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be <a title="escape" href="#dt-escape">escaped</a> | ||
302 | using either <a title="Character Reference" href="#dt-charref">numeric character references</a> | ||
303 | or the strings "<code>&amp;</code>" and "<code>&lt;</code>" | ||
304 | respectively. The right angle bracket (>) <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> be represented using the string "<code>&gt;</code>", | ||
305 | and <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em>, <a title="For Compatibility" href="#dt-compat">for compatibility</a>, be escaped | ||
306 | using <span>either</span> "<code>&gt;</code>" or a character reference when it | ||
307 | appears in the string "<code>]]></code>" in content, when | ||
308 | that string is not marking the end of a <a title="CDATA Section" href="#dt-cdsection">CDATA | ||
309 | section</a>.</p><p>In the content of elements, character data is any string of characters | ||
310 | which does not contain the start-delimiter of any markup or the | ||
311 | CDATA-section-close delimiter, | ||
312 | "<code>]]></code>". | ||
313 | In a CDATA section, | ||
314 | character data is any string of characters not including the CDATA-section-close | ||
315 | delimiter.</p><p>To allow attribute values to contain both single and double quotes, the | ||
316 | apostrophe or single-quote character (') <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> be represented as "<code>&apos;</code>", | ||
317 | and the double-quote character (") as "<code>&quot;</code>".</p> <h5><a name="IDASZ1S" id="IDASZ1S" />Character Data</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-CharData" id="NT-CharData" />[14] </td><td><code>CharData</code></td><td> ::= </td><td><code>[^<&]* - ([^<&]* ']]>' [^<&]*)</code></td></tr></tbody></table></div><div class="div2"> <h3><a name="sec-comments" id="sec-comments" />2.5 Comments</h3><p>[<a name="dt-comment" id="dt-comment" title="Comment">Definition</a>: <b>Comments</b> <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> appear | ||
318 | anywhere in a document outside other <a title="Markup" href="#dt-markup">markup</a>; | ||
319 | in addition, they <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> appear within the document type declaration at places | ||
320 | allowed by the grammar. They are not part of the document's <a title="Character Data" href="#dt-chardata">character | ||
321 | data</a>; an XML processor <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em>, but need not, make it possible for an | ||
322 | application to retrieve the text of comments. <a title="For Compatibility" href="#dt-compat">For | ||
323 | compatibility</a>, the string "<code>--</code>" (double-hyphen) | ||
324 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> occur within comments.] Parameter | ||
325 | entity references <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> be</span> recognized within comments.</p> <h5><a name="IDAL11S" id="IDAL11S" />Comments</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-Comment" id="NT-Comment" />[15] </td><td><code>Comment</code></td><td> ::= </td><td><code>'<!--' ((<a href="#NT-Char">Char</a> - '-') | ('-' | ||
326 | (<a href="#NT-Char">Char</a> - '-')))* '-->'</code></td></tr></tbody></table><p>An example of a comment:</p><div class="exampleInner"><pre><!-- declarations for <head> & <body> --></pre></div><p>Note | ||
327 | that the grammar does not allow a comment ending in <code>---></code>. The | ||
328 | following example is <em>not</em> well-formed.</p><div class="exampleInner"><pre><!-- B+, B, or B---></pre></div></div><div class="div2"> <h3><a name="sec-pi" id="sec-pi" />2.6 Processing Instructions</h3><p>[<a name="dt-pi" id="dt-pi" title="Processing instruction">Definition</a>: <b>Processing instructions</b> | ||
329 | (PIs) allow documents to contain instructions for applications.]</p> <h5><a name="IDAD31S" id="IDAD31S" />Processing Instructions</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-PI" id="NT-PI" />[16] </td><td><code>PI</code></td><td> ::= </td><td><code>'<?' <a href="#NT-PITarget">PITarget</a> (<a href="#NT-S">S</a> | ||
330 | (<a href="#NT-Char">Char</a>* - (<a href="#NT-Char">Char</a>* '?>' <a href="#NT-Char">Char</a>*)))? '?>'</code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-PITarget" id="NT-PITarget" />[17] </td><td><code>PITarget</code></td><td> ::= </td><td><code><a href="#NT-Name">Name</a> - (('X' | 'x') ('M' | | ||
331 | 'm') ('L' | 'l'))</code></td></tr></tbody></table><p>PIs are not part of the document's <a title="Character Data" href="#dt-chardata">character | ||
332 | data</a>, but <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be passed through to the application. The PI begins | ||
333 | with a target (<a href="#NT-PITarget">PITarget</a>) used to identify the application | ||
334 | to which the instruction is directed. The target names "<code>XML</code>", "<code>xml</code>", | ||
335 | and so on are reserved for standardization in this or future versions of this | ||
336 | specification. The XML <a title="Notation" href="#dt-notation">Notation</a> mechanism | ||
337 | <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> be used for formal declaration of PI targets. Parameter | ||
338 | entity references <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> be</span> recognized within processing instructions.</p></div><div class="div2"> <h3><a name="sec-cdata-sect" id="sec-cdata-sect" />2.7 CDATA Sections</h3><p>[<a name="dt-cdsection" id="dt-cdsection" title="CDATA Section">Definition</a>: <b>CDATA sections</b> <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> occur anywhere character data may occur; they are used to escape blocks | ||
339 | of text containing characters which would otherwise be recognized as markup. | ||
340 | CDATA sections begin with the string "<code><![CDATA[</code>" | ||
341 | and end with the string "<code>]]></code>":]</p> <h5><a name="IDAOA2S" id="IDAOA2S" />CDATA Sections</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-CDSect" id="NT-CDSect" />[18] </td><td><code>CDSect</code></td><td> ::= </td><td><code><a href="#NT-CDStart">CDStart</a> <a href="#NT-CData">CData</a> <a href="#NT-CDEnd">CDEnd</a></code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-CDStart" id="NT-CDStart" />[19] </td><td><code>CDStart</code></td><td> ::= </td><td><code>'<![CDATA['</code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-CData" id="NT-CData" />[20] </td><td><code>CData</code></td><td> ::= </td><td><code>(<a href="#NT-Char">Char</a>* - (<a href="#NT-Char">Char</a>* | ||
342 | ']]>' <a href="#NT-Char">Char</a>*)) </code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-CDEnd" id="NT-CDEnd" />[21] </td><td><code>CDEnd</code></td><td> ::= </td><td><code>']]>'</code></td></tr></tbody></table><p>Within a CDATA section, only the <a href="#NT-CDEnd">CDEnd</a> string is | ||
343 | recognized as markup, so that left angle brackets and ampersands may occur | ||
344 | in their literal form; they need not (and cannot) be escaped using "<code>&lt;</code>" | ||
345 | and "<code>&amp;</code>". CDATA sections cannot nest.</p><p>An example of a CDATA section, in which "<code><greeting></code>" | ||
346 | and "<code></greeting></code>" are recognized as <a title="Character Data" href="#dt-chardata">character data</a>, not <a title="Markup" href="#dt-markup">markup</a>:</p><div class="exampleInner"><pre><![CDATA[<greeting>Hello, world!</greeting>]]> </pre></div></div><div class="div2"> <h3><a name="sec-prolog-dtd" id="sec-prolog-dtd" />2.8 Prolog and Document Type Declaration</h3><p>[<a name="dt-xmldecl" id="dt-xmldecl" title="XML Declaration">Definition</a>: XML 1.1 documents <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> | ||
347 | begin with an <b>XML declaration</b> which specifies the version of | ||
348 | XML being used.] For example, the following is a complete XML 1.1 document, <a title="Well-Formed" href="#dt-wellformed">well-formed</a> but not <a title="Validity" href="#dt-valid">valid</a>:</p><div class="exampleInner"><pre><?xml version="1.1"?> | ||
349 | <greeting>Hello, world!</greeting> </pre></div><p>but the following is an XML 1.0 document because it | ||
350 | does not have an XML declaration:</p><div class="exampleInner"><pre><greeting>Hello, world!</greeting></pre></div><p>The function of the markup in an XML document is to describe its storage and | ||
351 | logical structure and to associate <span>attribute | ||
352 | name-value</span> pairs with its logical structures. XML provides a mechanism, the | ||
353 | <a title="Document Type Declaration" href="#dt-doctype">document | ||
354 | type declaration</a>, to define constraints on the logical structure | ||
355 | and to support the use of predefined storage units. [<a name="dt-valid" id="dt-valid" title="Validity">Definition</a>: An XML document is <b>valid</b> if it has an associated | ||
356 | document type declaration and if the document complies with the constraints | ||
357 | expressed in it.]</p><p>The document type declaration <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> appear before the first <a title="Element" href="#dt-element">element</a> | ||
358 | in the document.</p> <h5><a name="xmldoc" id="xmldoc" />Prolog</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-prolog" id="NT-prolog" />[22] </td><td><code>prolog</code></td><td> ::= </td><td><code><a href="#NT-XMLDecl">XMLDecl</a> <a href="#NT-Misc">Misc</a>* | ||
359 | (<a href="#NT-doctypedecl">doctypedecl</a> <a href="#NT-Misc">Misc</a>*)?</code></td></tr><tr valign="baseline"><td><a name="NT-XMLDecl" id="NT-XMLDecl" />[23] </td><td><code>XMLDecl</code></td><td> ::= </td><td><code>'<?xml' <a href="#NT-VersionInfo">VersionInfo</a> <a href="#NT-EncodingDecl">EncodingDecl</a>? <a href="#NT-SDDecl">SDDecl</a>? <a href="#NT-S">S</a>?'?>'</code></td></tr><tr valign="baseline"><td><a name="NT-VersionInfo" id="NT-VersionInfo" />[24] </td><td><code>VersionInfo</code></td><td> ::= </td><td><code><a href="#NT-S">S</a> 'version' <a href="#NT-Eq">Eq</a> | ||
360 | ("'" <a href="#NT-VersionNum">VersionNum</a> "'" | '"' <a href="#NT-VersionNum">VersionNum</a> | ||
361 | '"')</code></td></tr><tr valign="baseline"><td><a name="NT-Eq" id="NT-Eq" />[25] </td><td><code>Eq</code></td><td> ::= </td><td><code><a href="#NT-S">S</a>? '=' <a href="#NT-S">S</a>?</code></td></tr><tr valign="baseline"><td><a name="NT-VersionNum" id="NT-VersionNum" />[26] </td><td><code>VersionNum</code></td><td> ::= </td><td><code>'1.1'</code></td></tr><tr valign="baseline"><td><a name="NT-Misc" id="NT-Misc" />[27] </td><td><code>Misc</code></td><td> ::= </td><td><code><a href="#NT-Comment">Comment</a> | <a href="#NT-PI">PI</a> | ||
362 | | <a href="#NT-S">S</a></code></td></tr></tbody></table><p>[<a name="dt-doctype" id="dt-doctype" title="Document Type Declaration">Definition</a>: The XML <b>document | ||
363 | type declaration</b> contains or points to <a title="markup declaration" href="#dt-markupdecl">markup | ||
364 | declarations</a> that provide a grammar for a class of documents. This | ||
365 | grammar is known as a document type definition, or <b>DTD</b>. The document | ||
366 | type declaration can point to an external subset (a special kind of <a title="External Entity" href="#dt-extent">external entity</a>) containing markup declarations, | ||
367 | or can contain the markup declarations directly in an internal subset, or | ||
368 | can do both. The DTD for a document consists of both subsets taken together.]</p><p>[<a name="dt-markupdecl" id="dt-markupdecl" title="markup declaration">Definition</a>: A <b>markup declaration</b> | ||
369 | is an <a title="Element Type declaration" href="#dt-eldecl">element type declaration</a>, an <a title="Attribute-List Declaration" href="#dt-attdecl">attribute-list declaration</a>, an <a title="entity declaration" href="#dt-entdecl">entity | ||
370 | declaration</a>, or a <a title="Notation Declaration" href="#dt-notdecl">notation declaration</a>.] | ||
371 | These declarations <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> be contained in whole or in part within <a title="Parameter entity" href="#dt-PE">parameter | ||
372 | entities</a>, as described in the well-formedness and validity constraints | ||
373 | below. For further | ||
374 | information, see <a href="#sec-physical-struct"><b>4 Physical Structures</b></a>.</p> <h5><a name="dtd" id="dtd" />Document Type Definition</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-doctypedecl" id="NT-doctypedecl" />[28] </td><td><code>doctypedecl</code></td><td> ::= </td><td><code>'<!DOCTYPE' <a href="#NT-S">S</a> <a href="#NT-Name">Name</a> | ||
375 | (<a href="#NT-S">S</a> <a href="#NT-ExternalID">ExternalID</a>)? <a href="#NT-S">S</a>? | ||
376 | ('[' <a href="#NT-intSubset">intSubset</a> ']' <a href="#NT-S">S</a>?)? '>'</code></td><td><a href="#vc-roottype">[VC: Root Element Type]</a></td></tr><tr valign="baseline"><td /><td /><td /><td /><td><a href="#ExtSubset">[WFC: External Subset]</a></td></tr><tr valign="baseline"><td><a name="NT-DeclSep" id="NT-DeclSep" />[28a] </td><td><code>DeclSep</code></td><td> ::= </td><td><code><a href="#NT-PEReference">PEReference</a> | <a href="#NT-S">S</a></code></td><td><a href="#PE-between-Decls">[WFC: PE Between Declarations]</a></td></tr><tr valign="baseline"><td><a name="NT-intSubset" id="NT-intSubset" />[28b] </td><td><code>intSubset</code></td><td> ::= </td><td><code>(<a href="#NT-markupdecl">markupdecl</a> | <a href="#NT-DeclSep">DeclSep</a>)*</code></td></tr><tr valign="baseline"><td><a name="NT-markupdecl" id="NT-markupdecl" />[29] </td><td><code>markupdecl</code></td><td> ::= </td><td><code><a href="#NT-elementdecl">elementdecl</a> | <a href="#NT-AttlistDecl">AttlistDecl</a> | <a href="#NT-EntityDecl">EntityDecl</a> | ||
377 | | <a href="#NT-NotationDecl">NotationDecl</a> | <a href="#NT-PI">PI</a> | <a href="#NT-Comment">Comment</a></code></td><td><a href="#vc-PEinMarkupDecl">[VC: Proper Declaration/PE Nesting]</a></td></tr><tr valign="baseline"><td /><td /><td /><td /><td><a href="#wfc-PEinInternalSubset">[WFC: PEs in Internal Subset]</a></td></tr></tbody></table><p>Note | ||
378 | that it is possible to construct a well-formed document containing a <a href="#NT-doctypedecl">doctypedecl</a> | ||
379 | that neither points to an external subset nor contains an internal subset.</p><p>The markup declarations <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> be made up in whole or in part of the <a title="Replacement Text" href="#dt-repltext">replacement text</a> of <a title="Parameter entity" href="#dt-PE">parameter | ||
380 | entities</a>. The productions later in this specification for individual | ||
381 | nonterminals (<a href="#NT-elementdecl">elementdecl</a>, <a href="#NT-AttlistDecl">AttlistDecl</a>, | ||
382 | and so on) describe the declarations <em>after</em> all the parameter | ||
383 | entities have been <a title="Include" href="#dt-include">included</a>.</p><p>Parameter | ||
384 | entity references are recognized anywhere in the DTD (internal and external | ||
385 | subsets and external parameter entities), except in literals, processing instructions, | ||
386 | comments, and the contents of ignored conditional sections (see <a href="#sec-condition-sect"><b>3.4 Conditional Sections</b></a>). | ||
387 | They are also recognized in entity value literals. The use of parameter entities | ||
388 | in the internal subset is restricted as described below.</p><div class="constraint"><p class="prefix"><a name="vc-roottype" id="vc-roottype" /><b>Validity constraint: Root Element Type</b></p><p>The <a href="#NT-Name">Name</a> | ||
389 | in the document type declaration <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match the element type of the <a title="Root Element" href="#dt-root">root element</a>.</p></div><div class="constraint"><p class="prefix"><a name="vc-PEinMarkupDecl" id="vc-PEinMarkupDecl" /><b>Validity constraint: Proper Declaration/PE Nesting</b></p><p>Parameter-entity <a title="Replacement Text" href="#dt-repltext">replacement text</a> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be properly nested with markup declarations. That is to say, if either | ||
390 | the first character or the last character of a markup declaration (<a href="#NT-markupdecl">markupdecl</a> | ||
391 | above) is contained in the replacement text for a <a title="Parameter-entity reference" href="#dt-PERef">parameter-entity | ||
392 | reference</a>, both <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be contained in the same replacement text.</p></div><div class="constraint"><p class="prefix"><a name="wfc-PEinInternalSubset" id="wfc-PEinInternalSubset" /><b>Well-formedness constraint: PEs in Internal Subset</b></p><p>In | ||
393 | the internal DTD subset, <a title="Parameter-entity reference" href="#dt-PERef">parameter-entity references</a> <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> occur within markup declarations; they <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> occur where markup declarations can occur</span>. | ||
394 | (This does not apply to references that occur in external parameter entities | ||
395 | or to the external subset.)</p></div><div class="constraint"><p class="prefix"><a name="ExtSubset" id="ExtSubset" /><b>Well-formedness constraint: External Subset</b></p><p>The external subset, if any, <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match the production for <a href="#NT-extSubset">extSubset</a>.</p></div><div class="constraint"><p class="prefix"><a name="PE-between-Decls" id="PE-between-Decls" /><b>Well-formedness constraint: PE Between Declarations</b></p><p>The replacement text of a parameter entity reference | ||
396 | in a <a href="#NT-DeclSep">DeclSep</a> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match the production <a href="#NT-extSubsetDecl">extSubsetDecl</a>.</p></div><p>Like the internal subset, the external subset and any external parameter | ||
397 | entities referenced | ||
398 | in a <a href="#NT-DeclSep">DeclSep</a> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> consist of a series of | ||
399 | complete markup declarations of the types allowed by the non-terminal symbol <a href="#NT-markupdecl">markupdecl</a>, interspersed with white space or <a title="Parameter-entity reference" href="#dt-PERef">parameter-entity references</a>. However, portions of | ||
400 | the contents of the external subset or of these | ||
401 | external parameter entities <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> conditionally be ignored by using the <a title="conditional section" href="#dt-cond-section">conditional section</a> construct; this is not | ||
402 | allowed in the internal subset<span> but is | ||
403 | allowed in external parameter entities referenced in the internal subset</span>.</p> <h5><a name="ext-Subset" id="ext-Subset" />External Subset</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-extSubset" id="NT-extSubset" />[30] </td><td><code>extSubset</code></td><td> ::= </td><td><code><a href="#NT-TextDecl">TextDecl</a>? <a href="#NT-extSubsetDecl">extSubsetDecl</a></code></td></tr><tr valign="baseline"><td><a name="NT-extSubsetDecl" id="NT-extSubsetDecl" />[31] </td><td><code>extSubsetDecl</code></td><td> ::= </td><td><code>( <a href="#NT-markupdecl">markupdecl</a> | <a href="#NT-conditionalSect">conditionalSect</a> | <a href="#NT-DeclSep">DeclSep</a>)*</code></td></tr></tbody></table><p>The external subset and external parameter entities also differ from the | ||
404 | internal subset in that in them, <a title="Parameter-entity reference" href="#dt-PERef">parameter-entity | ||
405 | references</a> are permitted <em>within</em> markup declarations, | ||
406 | not only <em>between</em> markup declarations.</p><p>An example of an XML document with a document type declaration:</p><div class="exampleInner"><pre><?xml version="1.1"?> | ||
407 | <!DOCTYPE greeting SYSTEM "hello.dtd"> | ||
408 | <greeting>Hello, world!</greeting> </pre></div><p>The <a title="System Identifier" href="#dt-sysid">system identifier</a> "<code>hello.dtd</code>" | ||
409 | gives the address (a URI reference) of a DTD for the document.</p><p>The declarations can also be given locally, as in this example:</p><div class="exampleInner"><pre><?xml version="1.1" encoding="UTF-8" ?> | ||
410 | <!DOCTYPE greeting [ | ||
411 | <!ELEMENT greeting (#PCDATA)> | ||
412 | ]> | ||
413 | <greeting>Hello, world!</greeting></pre></div><p>If both the external and internal subsets are used, the internal subset | ||
414 | <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be</span> considered to occur before the external subset. | ||
415 | This has the effect that entity and attribute-list declarations in the internal | ||
416 | subset take precedence over those in the external subset.</p><p>XML 1.1 processors <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> accept XML 1.0 | ||
417 | documents as well. If a document is well-formed or valid XML 1.0, and provided it | ||
418 | does not contain any control characters | ||
419 | in the range [#x7F-#x9F] other than as character escapes, it may be | ||
420 | made well-formed or valid XML 1.1 respectively simply by changing the | ||
421 | version number.</p></div><div class="div2"> <h3><a name="sec-rmd" id="sec-rmd" />2.9 Standalone Document Declaration</h3><p>Markup declarations can affect the content of the document, as passed from | ||
422 | an <a title="XML Processor" href="#dt-xml-proc">XML processor</a> to an application; examples | ||
423 | are attribute defaults and entity declarations. The standalone document declaration, | ||
424 | which <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> appear as a component of the XML declaration, signals whether or | ||
425 | not there are such declarations which appear external to the <a title="Document Entity" href="#dt-docent">document | ||
426 | entity</a> | ||
427 | or in parameter entities. [<a name="dt-extmkpdecl" id="dt-extmkpdecl" title="External Markup Declaration">Definition</a>: An <b>external | ||
428 | markup declaration</b> is defined as a markup declaration occurring in | ||
429 | the external subset or in a parameter entity (external or internal, the latter | ||
430 | being included because non-validating processors are not required to read | ||
431 | them).]</p> <h5><a name="fulldtd" id="fulldtd" />Standalone Document Declaration</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-SDDecl" id="NT-SDDecl" />[32] </td><td><code>SDDecl</code></td><td> ::= </td><td><code>#x20+ 'standalone' <a href="#NT-Eq">Eq</a> | ||
432 | (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"')) </code></td><td><a href="#vc-check-rmd">[VC: Standalone Document Declaration]</a></td></tr></tbody></table><p>In a standalone document declaration, the value "yes" indicates | ||
433 | that there are no <a title="External Markup Declaration" href="#dt-extmkpdecl">external markup declarations</a> which | ||
434 | affect the information passed from the XML processor to the application. The | ||
435 | value "no" indicates that there are or may be such external | ||
436 | markup declarations. Note that the standalone document declaration only denotes | ||
437 | the presence of external <em>declarations</em>; the presence, in a document, | ||
438 | of references to external <em>entities</em>, when those entities are internally | ||
439 | declared, does not change its standalone status.</p><p>If there are no external markup declarations, the standalone document declaration | ||
440 | has no meaning. If there are external markup declarations but there is no | ||
441 | standalone document declaration, the value "no" is assumed.</p><p>Any XML document for which <code>standalone="no"</code> holds can be converted | ||
442 | algorithmically to a standalone document, which may be desirable for some | ||
443 | network delivery applications.</p><div class="constraint"><p class="prefix"><a name="vc-check-rmd" id="vc-check-rmd" /><b>Validity constraint: Standalone Document Declaration</b></p><p>The | ||
444 | standalone document declaration <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> have the value "no" if | ||
445 | any external markup declarations contain declarations of:</p><ul><li><p>attributes with <a title="Attribute Default" href="#dt-default">default</a> values, | ||
446 | if elements to which these attributes apply appear in the document without | ||
447 | specifications of values for these attributes, or</p></li><li><p>entities (other than <code>amp</code>, | ||
448 | <code>lt</code>, | ||
449 | <code>gt</code>, | ||
450 | <code>apos</code>, | ||
451 | <code>quot</code>), if <a title="Entity Reference" href="#dt-entref">references</a> | ||
452 | to those entities appear in the document, or</p></li><li><p>attributes with | ||
453 | tokenized types, where the | ||
454 | attribute appears in the document with a value such that | ||
455 | <a href="#AVNormalize"><cite>normalization</cite></a> | ||
456 | will produce a different value from that which would be produced | ||
457 | in the absence of the declaration, or</p></li><li><p>element types with <a title="Element content" href="#dt-elemcontent">element content</a>, | ||
458 | if white space occurs directly within any instance of those types.</p></li></ul></div><p>An example XML declaration with a standalone document declaration:</p><div class="exampleInner"><pre><?xml version="1.1" standalone='yes'?></pre></div></div><div class="div2"> <h3><a name="sec-white-space" id="sec-white-space" />2.10 White Space Handling</h3><p>In editing XML documents, it is often convenient to use "white space" | ||
459 | (spaces, tabs, and blank lines) | ||
460 | to set apart the markup for greater readability. Such white space is typically | ||
461 | not intended for inclusion in the delivered version of the document. On the | ||
462 | other hand, "significant" white space that should be preserved | ||
463 | in the delivered version is common, for example in poetry and source code.</p><p>An <a title="XML Processor" href="#dt-xml-proc">XML processor</a> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> always pass | ||
464 | all characters in a document that are not markup through to the application. | ||
465 | A <a title="Validating Processor" href="#dt-validating"> validating XML processor</a> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> also | ||
466 | inform the application which of these characters constitute white space appearing | ||
467 | in <a title="Element content" href="#dt-elemcontent">element content</a>.</p><p>A special <a title="Attribute" href="#dt-attr">attribute</a> named <code>xml:space</code> <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> be attached to an element to signal an intention that in that element, | ||
468 | white space should be preserved by applications. In valid documents, this | ||
469 | attribute, like any other, <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be <a title="Attribute-List Declaration" href="#dt-attdecl">declared</a> | ||
470 | if it is used. When declared, it <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be given as an <a title="Enumerated Attribute
Values" href="#dt-enumerated">enumerated | ||
471 | type</a> whose values | ||
472 | are one or both of "default" and "preserve". | ||
473 | For example:</p><div class="exampleInner"><pre><!ATTLIST poem xml:space (default|preserve) 'preserve'> | ||
474 | <!ATTLIST pre xml:space (preserve) #FIXED 'preserve'></pre></div><p>The value "default" signals that applications' default white-space | ||
475 | processing modes are acceptable for this element; the value "preserve" | ||
476 | indicates the intent that applications preserve all the white space. This | ||
477 | declared intent is considered to apply to all elements within the content | ||
478 | of the element where it is specified, unless <span>overridden</span> with | ||
479 | another instance of the <code>xml:space</code> attribute. <span>This specification does not give meaning to any value of <code>xml:space</code> other than "default" and "preserve". It is an error for other values to be specified; the XML processor <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> report the error or <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> recover by ignoring the attribute specification or by reporting the (erroneous) value to the application. Applications may ignore or reject erroneous values.</span></p><p>The <a title="Root Element" href="#dt-root">root element</a> of any document is considered | ||
480 | to have signaled no intentions as regards application space handling, unless | ||
481 | it provides a value for this attribute or the attribute is declared with a | ||
482 | default value.</p></div><div class="div2"> <h3><a name="sec-line-ends" id="sec-line-ends" />2.11 End-of-Line Handling</h3><p>XML <a title="Text Entity" href="#dt-parsedent">parsed entities</a> are often stored | ||
483 | in computer files which, for editing convenience, are organized into lines. | ||
484 | These lines are typically separated by some combination of the characters | ||
485 | CARRIAGE RETURN (#xD) and LINE FEED (#xA).</p><p>To | ||
486 | simplify the tasks of <a title="Application" href="#dt-app">applications</a>, the | ||
487 | <span><a title="XML Processor" href="#dt-xml-proc">XML | ||
488 | processor</a> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> behave as if it</span> normalized all line breaks in external parsed | ||
489 | entities (including the document entity) on input, before parsing, by translating | ||
490 | |||
491 | <span>all of the following to a single #xA character:</span></p><ol type="1"><li><p>the two-character sequence #xD #xA</p></li><li><p>the two-character sequence #xD #x85</p></li><li><p>the single character #x85</p></li><li><p>the single character #x2028</p></li><li><p>any #xD character that is not immediately followed by #xA or #x85.</p></li></ol><p> The characters #x85 and #x2028 cannot be reliably recognized and | ||
492 | translated until an entity's encoding declaration (if present) has | ||
493 | been read. Therefore, it is a fatal error to use them within the XML | ||
494 | declaration or text declaration. | ||
495 | </p></div><div class="div2"> <h3><a name="sec-lang-tag" id="sec-lang-tag" />2.12 Language Identification</h3><p>In document processing, it is often useful to identify the natural or formal | ||
496 | language in which the content is written. A special <a title="Attribute" href="#dt-attr">attribute</a> | ||
497 | named <code>xml:lang</code> <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> be inserted in documents to specify the language | ||
498 | used in the contents and attribute values of any element in an XML document. | ||
499 | In valid documents, this attribute, like any other, <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be <a title="Attribute-List Declaration" href="#dt-attdecl">declared</a> | ||
500 | if it is used. The | ||
501 | values of the attribute are language identifiers as defined by <a href="#RFC1766">[IETF RFC 3066]</a>, <cite>Tags | ||
502 | for the Identification of Languages</cite>, or its successor<span>; in addition, the empty string <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> be specified</span>.</p><p>(Productions 33 through 38 have been removed.)</p><p>For example:</p><div class="exampleInner"><pre><p xml:lang="en">The quick brown fox jumps over the lazy dog.</p> | ||
503 | <p xml:lang="en-GB">What colour is it?</p> | ||
504 | <p xml:lang="en-US">What color is it?</p> | ||
505 | <sp who="Faust" desc='leise' xml:lang="de"> | ||
506 | <l>Habe nun, ach! Philosophie,</l> | ||
507 | <l>Juristerei, und Medizin</l> | ||
508 | <l>und leider auch Theologie</l> | ||
509 | <l>durchaus studiert mit hei&#xDF;em Bem&#xFC;h'n.</l> | ||
510 | </sp></pre></div><p>The intent declared with <code>xml:lang</code> is considered to apply to | ||
511 | all attributes and content of the element where it is specified, unless overridden | ||
512 | with an instance of <code>xml:lang</code> on another element within that content. <span>In particular, the empty value of <code>xml:lang</code> is used on an element B to override a specification of <code>xml:lang</code> on an enclosing element A, without specifying another language. Within B, it is considered that there is no language information available, just as if <code>xml:lang</code> had not been specified on B or any of its ancestors.</span></p><div class="note"><p class="prefix"><b>Note:</b></p><p>Language information may also be provided by external transport protocols (e.g. HTTP or | ||
513 | MIME). When available, this information may be used by XML applications, but the more local | ||
514 | information provided by <code>xml:lang</code> should be considered to override it. | ||
515 | </p></div><p>A simple declaration for <code>xml:lang</code> might take the form</p><div class="exampleInner"><pre>xml:lang <span>CDATA</span> #IMPLIED</pre></div><p>but specific default values <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> also be given, if appropriate. In a collection | ||
516 | of French poems for English students, with glosses and notes in English, the <code>xml:lang</code> | ||
517 | attribute might be declared this way:</p><div class="exampleInner"><pre><!ATTLIST poem xml:lang <span>CDATA</span> 'fr'> | ||
518 | <!ATTLIST gloss xml:lang <span>CDATA</span> 'en'> | ||
519 | <!ATTLIST note xml:lang <span>CDATA</span> 'en'></pre></div></div><div class="div2"> <h3><a name="sec-normalization-checking" id="sec-normalization-checking" />2.13 Normalization Checking</h3><p>All XML <a title="Text Entity" href="#dt-parsedent"> parsed | ||
520 | entities</a> (including <a title="Document Entity" href="#dt-docent"> document | ||
521 | entities</a>) <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> be <a title="fully normalized" href="#dt-fullnorm">fully | ||
522 | normalized</a> as per the definition of | ||
523 | <a href="#sec-CharNorm"><b>B Definitions for Character Normalization</b></a> supplemented by the following definitions of | ||
524 | <em><a name="dt-relconst" id="dt-relconst" />relevant constructs</em> for XML:</p><ol type="1"><li><p>The <a title="Replacement Text" href="#dt-repltext"> | ||
525 | replacement text</a> of all <a title="Text Entity" href="#dt-parsedent">parsed | ||
526 | entities</a></p></li><li><p>All text matching, in context, one of the following | ||
527 | productions:</p><ol type="a"><li><p><a href="#NT-CData"> | ||
528 | CData</a></p></li><li><p><a href="#NT-CharData"> | ||
529 | CharData</a></p></li><li><p><a href="#NT-content"> | ||
530 | content</a></p></li><li><p><a href="#NT-Name"> Name</a></p></li><li><p><a href="#NT-Nmtoken"> | ||
531 | Nmtoken</a></p></li></ol></li></ol><p>However, a document is still well-formed even if it is not | ||
532 | <a title="fully normalized" href="#dt-fullnorm">fully normalized</a>. | ||
533 | XML processors <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> provide a user option to verify that the document being | ||
534 | processed is in <a title="fully normalized" href="#dt-fullnorm">fully normalized</a> form, and report to the application whether | ||
535 | it is or not. The option to not verify <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> be chosen only when the | ||
536 | input text is <a title="certified" href="#dt-certified">certified</a>, | ||
537 | as defined by <a href="#sec-CharNorm"><b>B Definitions for Character Normalization</b></a>.</p><p>The verification of full normalization <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be carried out as if by | ||
538 | first verifying that the entity is in <a title="include-normalized" href="#dt-inclnorm">include-normalized</a> | ||
539 | form as defined by <a href="#sec-CharNorm"><b>B Definitions for Character Normalization</b></a> and by then verifying that none of the relevant | ||
540 | constructs listed above begins (after character references are | ||
541 | expanded) with a <a title="composing character" href="#dt-compchar">composing character</a> as defined by | ||
542 | <a href="#sec-CharNorm"><b>B Definitions for Character Normalization</b></a>. | ||
543 | Non-validating processors <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> ignore possible | ||
544 | denormalizations that would be caused by inclusion of external | ||
545 | entities that they do not read.</p><div class="note"><p class="prefix"><b>Note:</b></p><p>The <a title="composing character" href="#dt-compchar">composing characters</a> are all | ||
546 | Unicode characters of non-zero combining class, plus a small number | ||
547 | of class-zero characters that nevertheless take part as a | ||
548 | non-initial character in certain Unicode canonical | ||
549 | decompositions. Since these characters are meant to follow | ||
550 | base characters, restricting relevant constructs (including | ||
551 | content) from beginning with a <a title="composing character" href="#dt-compchar">composing character</a> does not | ||
552 | meaningfully diminish the expressiveness of XML.</p></div><p>If, while verifying full normalization, a processor encounters | ||
553 | characters for which it cannot determine the normalization | ||
554 | properties (i.e., characters introduced in a version of Unicode <a href="#Unicode">[Unicode]</a> | ||
555 | later than the one used in the implementation of the processor), | ||
556 | then the processor <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em>, at user option, ignore any possible | ||
557 | denormalizations caused by these characters. The option to ignore | ||
558 | those denormalizations <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD NOT</em> be chosen by applications when | ||
559 | reliability or security are critical.</p><p> XML processors <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> transform the input to be in | ||
560 | <a title="fully normalized" href="#dt-fullnorm">fully normalized</a> form. | ||
561 | XML applications that create XML 1.1 output | ||
562 | from either XML 1.1 or XML 1.0 input <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> ensure that the output | ||
563 | is <a title="fully normalized" href="#dt-fullnorm">fully normalized</a>; it is not necessary for internal processing | ||
564 | forms to be <a title="fully normalized" href="#dt-fullnorm">fully normalized</a>.</p><p>The purpose of this section is to strongly encourage XML | ||
565 | processors to ensure that the creators of XML documents have | ||
566 | properly normalized them, so that XML applications can make tests | ||
567 | such as identity comparisons of strings without having to worry | ||
568 | about the different possible "spellings" of strings which | ||
569 | Unicode allows. | ||
570 | </p><p>When entities are in a non-Unicode encoding, if the processor | ||
571 | transcodes them to Unicode, it <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> use a normalizing transcoder. | ||
572 | </p></div></div><div class="div1"> <h2><a name="sec-logical-struct" id="sec-logical-struct" />3 Logical Structures</h2><p>[<a name="dt-element" id="dt-element" title="Element">Definition</a>: Each <a title="XML Document" href="#dt-xml-doc">XML | ||
573 | document</a> contains one or more <b>elements</b>, the boundaries | ||
574 | of which are either delimited by <a title="Start-Tag" href="#dt-stag">start-tags</a> | ||
575 | and <a title="End Tag" href="#dt-etag">end-tags</a>, or, for <a title="Empty" href="#dt-empty">empty</a> | ||
576 | elements, by an <a title="empty-element tag" href="#dt-eetag">empty-element tag</a>. Each | ||
577 | element has a type, identified by name, sometimes called its "generic | ||
578 | identifier" (GI), and <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> have a set of attribute specifications.] | ||
579 | Each attribute specification has a <a title="Attribute Name" href="#dt-attrname">name</a> | ||
580 | and a <a title="Attribute Value" href="#dt-attrval">value</a>.</p> <h5><a name="IDATJ3S" id="IDATJ3S" />Element</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-element" id="NT-element" />[39] </td><td><code>element</code></td><td> ::= </td><td><code><a href="#NT-EmptyElemTag">EmptyElemTag</a></code></td></tr><tr valign="baseline"><td /><td /><td /><td><code>| <a href="#NT-STag">STag</a> <a href="#NT-content">content</a> <a href="#NT-ETag">ETag</a></code></td><td><a href="#GIMatch">[WFC: Element Type Match]</a></td></tr><tr valign="baseline"><td /><td /><td /><td /><td><a href="#elementvalid">[VC: Element Valid]</a></td></tr></tbody></table><p>This specification does not constrain the semantics, use, or (beyond syntax) | ||
581 | names of the element types and attributes, except that names beginning with | ||
582 | a match to <code>(('X'|'x')('M'|'m')('L'|'l'))</code> are reserved for standardization | ||
583 | in this or future versions of this specification.</p><div class="constraint"><p class="prefix"><a name="GIMatch" id="GIMatch" /><b>Well-formedness constraint: Element Type Match</b></p><p>The <a href="#NT-Name">Name</a> | ||
584 | in an element's end-tag <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match the element type in the start-tag.</p></div><div class="constraint"><p class="prefix"><a name="elementvalid" id="elementvalid" /><b>Validity constraint: Element Valid</b></p><p>An element is valid | ||
585 | if there is a declaration matching <a href="#NT-elementdecl">elementdecl</a> | ||
586 | where the <a href="#NT-Name">Name</a> matches the element type, and one of | ||
587 | the following holds:</p><ol type="1"><li><p>The declaration matches <b>EMPTY</b> and the element has no <a title="Content" href="#dt-content">content</a> <span>(not even entity | ||
588 | references, comments, PIs or white space)</span>.</p></li><li><p>The declaration matches <a href="#NT-children">children</a> and the | ||
589 | sequence of <a title="Parent/Child" href="#dt-parentchild">child elements</a> belongs | ||
590 | to the language generated by the regular expression in the content model, | ||
591 | with optional white space<span>, comments and | ||
592 | PIs (i.e. markup matching production [27] <a href="#NT-Misc">Misc</a>)</span> between the | ||
593 | start-tag and the first child element, between child elements, or between | ||
594 | the last child element and the end-tag. Note that a CDATA section containing | ||
595 | only white space <span>or a reference | ||
596 | to an entity whose replacement text is character references expanding to white | ||
597 | space</span> <span>do</span> not | ||
598 | match the nonterminal <a href="#NT-S">S</a>, and | ||
599 | hence cannot appear in these positions<span>; however, a | ||
600 | reference to an internal entity with a literal value consisting of character | ||
601 | references expanding to white space does match <a href="#NT-S">S</a>, since its | ||
602 | replacement text is the white space resulting from expansion of the character | ||
603 | references</span>.</p></li><li><p>The declaration matches <a href="#NT-Mixed">Mixed</a> and the content | ||
604 | <span>(after replacing | ||
605 | any entity references with their replacement text)</span> consists of | ||
606 | <a title="Character Data" href="#dt-chardata">character data</a><span>, | ||
607 | <a title="Comment" href="#dt-comment">comments</a>, <a title="Processing instruction" href="#dt-pi">PIs</a></span> and <a title="Parent/Child" href="#dt-parentchild">child elements</a> whose types match names in the | ||
608 | content model.</p></li><li><p>The declaration matches <b>ANY</b>, and the | ||
609 | <span>content | ||
610 | <span>(after replacing | ||
611 | any entity references with their replacement text)</span> | ||
612 | consists of character data and <a title="Parent/Child" href="#dt-parentchild">child elements</a> | ||
613 | whose types</span> | ||
614 | have been declared.</p></li></ol></div><div class="div2"> <h3><a name="sec-starttags" id="sec-starttags" />3.1 Start-Tags, End-Tags, and Empty-Element Tags</h3><p>[<a name="dt-stag" id="dt-stag" title="Start-Tag">Definition</a>: The beginning of every non-empty | ||
615 | XML element is marked by a <b>start-tag</b>.]</p> <h5><a name="IDA3O3S" id="IDA3O3S" />Start-tag</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-STag" id="NT-STag" />[40] </td><td><code>STag</code></td><td> ::= </td><td><code>'<' <a href="#NT-Name">Name</a> (<a href="#NT-S">S</a> <a href="#NT-Attribute">Attribute</a>)* <a href="#NT-S">S</a>? '>'</code></td><td><a href="#uniqattspec">[WFC: Unique Att Spec]</a></td></tr><tr valign="baseline"><td><a name="NT-Attribute" id="NT-Attribute" />[41] </td><td><code>Attribute</code></td><td> ::= </td><td><code><a href="#NT-Name">Name</a> <a href="#NT-Eq">Eq</a> <a href="#NT-AttValue">AttValue</a></code></td><td><a href="#ValueType">[VC: Attribute Value Type]</a></td></tr><tr valign="baseline"><td /><td /><td /><td /><td><a href="#NoExternalRefs">[WFC: No External Entity References]</a></td></tr><tr valign="baseline"><td /><td /><td /><td /><td><a href="#CleanAttrVals">[WFC: No < in Attribute Values]</a></td></tr></tbody></table><p>The <a href="#NT-Name">Name</a> in the start- and end-tags gives the element's <b>type</b>. [<a name="dt-attr" id="dt-attr" title="Attribute">Definition</a>: The <a href="#NT-Name">Name</a>-<a href="#NT-AttValue">AttValue</a> | ||
616 | pairs are referred to as the <b>attribute specifications</b> of the | ||
617 | element], [<a name="dt-attrname" id="dt-attrname" title="Attribute Name">Definition</a>: with the <a href="#NT-Name">Name</a> in each pair referred to as the <b>attribute name</b>] | ||
618 | and [<a name="dt-attrval" id="dt-attrval" title="Attribute Value">Definition</a>: the content of the <a href="#NT-AttValue">AttValue</a> (the text between the <code>'</code> or <code>"</code> | ||
619 | delimiters) as the <b>attribute value</b>.] Note | ||
620 | that the order of attribute specifications in a start-tag or empty-element | ||
621 | tag is not significant.</p><div class="constraint"><p class="prefix"><a name="uniqattspec" id="uniqattspec" /><b>Well-formedness constraint: Unique Att Spec</b></p><p><span class="mustard">An attribute name | ||
622 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em></span> appear more than once in the same start-tag or empty-element tag.</p></div><div class="constraint"><p class="prefix"><a name="ValueType" id="ValueType" /><b>Validity constraint: Attribute Value Type</b></p><p>The attribute <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> | ||
623 | have been declared; the value <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be of the type declared for it. (For attribute | ||
624 | types, see <a href="#attdecls"><b>3.3 Attribute-List Declarations</b></a>.)</p></div><div class="constraint"><p class="prefix"><a name="NoExternalRefs" id="NoExternalRefs" /><b>Well-formedness constraint: No External Entity References</b></p><p>Attribute | ||
625 | values <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em></span> contain direct or indirect entity references to external entities.</p></div><div class="constraint"><p class="prefix"><a name="CleanAttrVals" id="CleanAttrVals" /><b>Well-formedness constraint: No <code>&lt;</code> in Attribute Values</b></p><p>The <a title="Replacement Text" href="#dt-repltext">replacement text</a> of any entity | ||
626 | referred to directly or indirectly in an attribute value <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> contain a <code>&lt;</code>.</p></div><p>An example of a start-tag:</p><div class="exampleInner"><pre>&lt;termdef id="dt-dog" term="dog"&gt;</pre></div><p>[<a name="dt-etag" id="dt-etag" title="End Tag">Definition</a>: The end of every element that begins | ||
627 | with a start-tag <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be marked by an <b>end-tag</b> containing a name | ||
628 | that echoes the element's type as given in the start-tag:]</p> <h5><a name="IDA3U3S" id="IDA3U3S" />End-tag</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-ETag" id="NT-ETag" />[42] </td><td><code>ETag</code></td><td> ::= </td><td><code>'</' <a href="#NT-Name">Name</a> <a href="#NT-S">S</a>? | ||
629 | '>'</code></td></tr></tbody></table><p>An example of an end-tag:</p><div class="exampleInner"><pre></termdef></pre></div><p>[<a name="dt-content" id="dt-content" title="Content">Definition</a>: The <a title="Text" href="#dt-text">text</a> | ||
630 | between the start-tag and end-tag is called the element's <b>content</b>:]</p> <h5><a name="IDAKW3S" id="IDAKW3S" />Content of Elements</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-content" id="NT-content" />[43] </td><td><code>content</code></td><td> ::= </td><td><code><a href="#NT-CharData">CharData</a>? ((<a href="#NT-element">element</a> | ||
631 | | <a href="#NT-Reference">Reference</a> | <a href="#NT-CDSect">CDSect</a> | ||
632 | | <a href="#NT-PI">PI</a> | <a href="#NT-Comment">Comment</a>) <a href="#NT-CharData">CharData</a>?)*</code></td></tr></tbody></table><p>[<a name="dt-empty" id="dt-empty" title="Empty">Definition</a>: An element | ||
633 | with no <a href="#NT-content">content</a> is said to be <b>empty</b>.] The representation | ||
634 | of an empty element is either a start-tag immediately followed by an end-tag, | ||
635 | or an empty-element tag. [<a name="dt-eetag" id="dt-eetag" title="empty-element tag">Definition</a>: An <b>empty-element | ||
636 | tag</b> takes a special form:]</p> <h5><a name="IDARY3S" id="IDARY3S" />Tags for Empty Elements</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-EmptyElemTag" id="NT-EmptyElemTag" />[44] </td><td><code>EmptyElemTag</code></td><td> ::= </td><td><code>'&lt;' <a href="#NT-Name">Name</a> (<a href="#NT-S">S</a> <a href="#NT-Attribute">Attribute</a>)* <a href="#NT-S">S</a>? '/&gt;'</code></td><td><a href="#uniqattspec">[WFC: Unique Att Spec]</a></td></tr></tbody></table><p>Empty-element tags <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> be used for any element which has no content, whether | ||
637 | or not it is declared using the keyword <b>EMPTY</b>. <a title="For interoperability" href="#dt-interop">For | ||
638 | interoperability</a>, the empty-element tag <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> | ||
639 | be used, and <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> only be used, for elements which are declared | ||
640 | EMPTY.</p><p>Examples of empty elements:</p><div class="exampleInner"><pre>&lt;IMG align="left" | ||
641 | src="http://www.w3.org/Icons/WWW/w3c_home" /&gt; | ||
642 | &lt;br&gt;&lt;/br&gt; | ||
643 | &lt;br/&gt;</pre></div></div><div class="div2"> <h3><a name="elemdecls" id="elemdecls" />3.2 Element Type Declarations</h3><p>The <a title="Element" href="#dt-element">element</a> structure of an <a title="XML Document" href="#dt-xml-doc">XML document</a> <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em>, for <a title="Validity" href="#dt-valid">validation</a> | ||
644 | purposes, be constrained using element type and attribute-list declarations. | ||
645 | An element type declaration constrains the element's <a title="Content" href="#dt-content">content</a>.</p><p>Element type declarations often constrain which element types can appear | ||
646 | as <a title="Parent/Child" href="#dt-parentchild">children</a> of the element. At user | ||
647 | option, an XML processor <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> issue a warning when a declaration mentions an | ||
648 | element type for which no declaration is provided, but this is not an error.</p><p>[<a name="dt-eldecl" id="dt-eldecl" title="Element Type declaration">Definition</a>: An <b>element | ||
649 | type declaration</b> takes the form:]</p> <h5><a name="IDAV13S" id="IDAV13S" />Element Type Declaration</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-elementdecl" id="NT-elementdecl" />[45] </td><td><code>elementdecl</code></td><td> ::= </td><td><code>'&lt;!ELEMENT' <a href="#NT-S">S</a> <a href="#NT-Name">Name</a> <a href="#NT-S">S</a> <a href="#NT-contentspec">contentspec</a> <a href="#NT-S">S</a>? | ||
650 | '&gt;'</code></td><td><a href="#EDUnique">[VC: Unique Element Type Declaration]</a></td></tr><tr valign="baseline"><td><a name="NT-contentspec" id="NT-contentspec" />[46] </td><td><code>contentspec</code></td><td> ::= </td><td><code>'EMPTY' | 'ANY' | <a href="#NT-Mixed">Mixed</a> | ||
651 | | <a href="#NT-children">children</a></code></td></tr></tbody></table><p>where the <a href="#NT-Name">Name</a> gives the element type being declared.</p><div class="constraint"><p class="prefix"><a name="EDUnique" id="EDUnique" /><b>Validity constraint: Unique Element Type Declaration</b></p><p><span class="mustard">An element | ||
652 | type <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em></span> be declared more than once.</p></div><p>Examples of element type declarations:</p><div class="exampleInner"><pre>&lt;!ELEMENT br EMPTY&gt; | ||
653 | &lt;!ELEMENT p (#PCDATA|emph)* &gt; | ||
654 | &lt;!ELEMENT %name.para; %content.para; &gt; | ||
655 | &lt;!ELEMENT container ANY&gt;</pre></div><div class="div3"> <h4><a name="sec-element-content" id="sec-element-content" />3.2.1 Element Content</h4><p>[<a name="dt-elemcontent" id="dt-elemcontent" title="Element content">Definition</a>: An element <a title="Start-Tag" href="#dt-stag">type</a> has <b>element content</b> when elements | ||
656 | of that type <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> contain only <a title="Parent/Child" href="#dt-parentchild">child</a> | ||
657 | elements (no character data), optionally separated by white space (characters | ||
658 | matching the nonterminal <a href="#NT-S">S</a>).] [<a name="dt-content-model" id="dt-content-model" title="Content model">Definition</a>: In this case, the constraint includes a <b>content | ||
659 | model</b>, a simple grammar governing the allowed types of the | ||
660 | child elements and the order in which they are allowed to appear.] | ||
661 | The grammar is built on content particles (<a href="#NT-cp">cp</a>s), which | ||
662 | consist of names, choice lists of content particles, or sequence lists of | ||
663 | content particles:</p> <h5><a name="IDAP53S" id="IDAP53S" />Element-content Models</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-children" id="NT-children" />[47] </td><td><code>children</code></td><td> ::= </td><td><code>(<a href="#NT-choice">choice</a> | <a href="#NT-seq">seq</a>) | ||
664 | ('?' | '*' | '+')?</code></td></tr><tr valign="baseline"><td><a name="NT-cp" id="NT-cp" />[48] </td><td><code>cp</code></td><td> ::= </td><td><code>(<a href="#NT-Name">Name</a> | <a href="#NT-choice">choice</a> | ||
665 | | <a href="#NT-seq">seq</a>) ('?' | '*' | '+')?</code></td></tr><tr valign="baseline"><td><a name="NT-choice" id="NT-choice" />[49] </td><td><code>choice</code></td><td> ::= </td><td><code>'(' <a href="#NT-S">S</a>? <a href="#NT-cp">cp</a> ( <a href="#NT-S">S</a>? '|' <a href="#NT-S">S</a>? <a href="#NT-cp">cp</a> )+ <a href="#NT-S">S</a>? ')'</code></td><td><a href="#vc-PEinGroup">[VC: Proper Group/PE Nesting]</a></td></tr><tr valign="baseline"><td><a name="NT-seq" id="NT-seq" />[50] </td><td><code>seq</code></td><td> ::= </td><td><code>'(' <a href="#NT-S">S</a>? <a href="#NT-cp">cp</a> ( <a href="#NT-S">S</a>? ',' <a href="#NT-S">S</a>? <a href="#NT-cp">cp</a> )* <a href="#NT-S">S</a>? ')'</code></td><td><a href="#vc-PEinGroup">[VC: Proper Group/PE Nesting]</a></td></tr></tbody></table><p>where each <a href="#NT-Name">Name</a> is the type of an element which | ||
666 | <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> appear as a <a title="Parent/Child" href="#dt-parentchild">child</a>. Any content | ||
667 | particle in a choice list <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> appear in the <a title="Element content" href="#dt-elemcontent">element | ||
668 | content</a> at the location where the choice list appears in the grammar; | ||
669 | content particles occurring in a sequence list <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> each appear in the <a title="Element content" href="#dt-elemcontent">element content</a> in the order given in the list. | ||
670 | The optional character following a name or list governs whether the element | ||
671 | or the content particles in the list may occur one or more (<code>+</code>), | ||
672 | zero or more (<code>*</code>), or zero or one times (<code>?</code>). The | ||
673 | absence of such an operator means that the element or content particle <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> | ||
674 | appear exactly once. This syntax and meaning are identical to those used in | ||
675 | the productions in this specification.</p><p>The content of an element matches a content model if and only if it is | ||
676 | possible to trace out a path through the content model, obeying the sequence, | ||
677 | choice, and repetition operators and matching each element in the content | ||
678 | against an element type in the content model. <a title="For Compatibility" href="#dt-compat">For | ||
679 | compatibility</a>, it is an error if <span>the content model | ||
680 | allows an element to match more than one occurrence of an element type in the | ||
681 | content model</span>. For more information, see <a href="#determinism"><b>D Deterministic Content Models</b></a>.</p><div class="constraint"><p class="prefix"><a name="vc-PEinGroup" id="vc-PEinGroup" /><b>Validity constraint: Proper Group/PE Nesting</b></p><p>Parameter-entity <a title="Replacement Text" href="#dt-repltext">replacement text</a> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be properly nested with parenthesized | ||
682 | groups. That is to say, if either of the opening or closing parentheses in | ||
683 | a <a href="#NT-choice">choice</a>, <a href="#NT-seq">seq</a>, or <a href="#NT-Mixed">Mixed</a> | ||
684 | construct is contained in the replacement text for a <a title="Parameter-entity reference" href="#dt-PERef">parameter | ||
685 | entity</a>, both <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be contained in the same replacement text.</p><p><a title="For interoperability" href="#dt-interop">For interoperability</a>, if a parameter-entity reference | ||
686 | appears in a <a href="#NT-choice">choice</a>, <a href="#NT-seq">seq</a>, or <a href="#NT-Mixed">Mixed</a> construct, its replacement text <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> contain at | ||
687 | least one non-blank character, and neither the first nor last non-blank character | ||
688 | of the replacement text <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> be a connector (<code>|</code> or <code>,</code>).</p></div><p>Examples of element-content models:</p><div class="exampleInner"><pre>&lt;!ELEMENT spec (front, body, back?)&gt; | ||
689 | &lt;!ELEMENT div1 (head, (p | list | note)*, div2*)&gt; | ||
690 | &lt;!ELEMENT dictionary-body (%div.mix; | %dict.mix;)*&gt;</pre></div></div><div class="div3"> <h4><a name="sec-mixed-content" id="sec-mixed-content" />3.2.2 Mixed Content</h4><p>[<a name="dt-mixed" id="dt-mixed" title="Mixed Content">Definition</a>: An element <a title="Start-Tag" href="#dt-stag">type</a> | ||
691 | has <b>mixed content</b> when elements of that type <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> contain character | ||
692 | data, optionally interspersed with <a title="Parent/Child" href="#dt-parentchild">child</a> | ||
693 | elements.] In this case, the types of the child elements <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> be constrained, | ||
694 | but not their order or their number of occurrences:</p> <h5><a name="IDAUHCU" id="IDAUHCU" />Mixed-content Declaration</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-Mixed" id="NT-Mixed" />[51] </td><td><code>Mixed</code></td><td> ::= </td><td><code>'(' <a href="#NT-S">S</a>? '#PCDATA' (<a href="#NT-S">S</a>? | ||
695 | '|' <a href="#NT-S">S</a>? <a href="#NT-Name">Name</a>)* <a href="#NT-S">S</a>? | ||
696 | ')*' </code></td></tr><tr valign="baseline"><td /><td /><td /><td><code>| '(' <a href="#NT-S">S</a>? '#PCDATA' <a href="#NT-S">S</a>? ')' </code></td><td><a href="#vc-PEinGroup">[VC: Proper Group/PE Nesting]</a></td></tr><tr valign="baseline"><td /><td /><td /><td /><td><a href="#vc-MixedChildrenUnique">[VC: No Duplicate Types]</a></td></tr></tbody></table><p>where the <a href="#NT-Name">Name</a>s give the types of elements that | ||
697 | may appear as children. The | ||
698 | keyword <b>#PCDATA</b> derives historically from the term "parsed | ||
699 | character data."</p><div class="constraint"><p class="prefix"><a name="vc-MixedChildrenUnique" id="vc-MixedChildrenUnique" /><b>Validity constraint: No Duplicate Types</b></p><p>The | ||
700 | same name <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> appear more than once in a single mixed-content declaration.</p></div><p>Examples of mixed content declarations:</p><div class="exampleInner"><pre>&lt;!ELEMENT p (#PCDATA|a|ul|b|i|em)*&gt; | ||
701 | &lt;!ELEMENT p (#PCDATA | %font; | %phrase; | %special; | %form;)* &gt; | ||
702 | &lt;!ELEMENT b (#PCDATA)&gt;</pre></div></div></div><div class="div2"> <h3><a name="attdecls" id="attdecls" />3.3 Attribute-List Declarations</h3><p><a title="Attribute" href="#dt-attr">Attributes</a> are used to associate name-value | ||
703 | pairs with <a title="Element" href="#dt-element">elements</a>. Attribute specifications | ||
704 | <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> appear outside of</span> <a title="Start-Tag" href="#dt-stag">start-tags</a> and <a title="empty-element tag" href="#dt-eetag">empty-element tags</a>; thus, the productions used to | ||
705 | recognize them appear in <a href="#sec-starttags"><b>3.1 Start-Tags, End-Tags, and Empty-Element Tags</b></a>. Attribute-list declarations | ||
706 | <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> be used:</p><ul><li><p>To define the set of attributes pertaining to a given element type.</p></li><li><p>To establish type constraints for these attributes.</p></li><li><p>To provide <a title="Attribute Default" href="#dt-default">default values</a> for | ||
707 | attributes.</p></li></ul><p>[<a name="dt-attdecl" id="dt-attdecl" title="Attribute-List Declaration">Definition</a>: <b>Attribute-list | ||
708 | declarations</b> specify the name, data type, and default value (if any) | ||
709 | of each attribute associated with a given element type:]</p> <h5><a name="IDADMCU" id="IDADMCU" />Attribute-list Declaration</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-AttlistDecl" id="NT-AttlistDecl" />[52] </td><td><code>AttlistDecl</code></td><td> ::= </td><td><code>'&lt;!ATTLIST' <a href="#NT-S">S</a> <a href="#NT-Name">Name</a> <a href="#NT-AttDef">AttDef</a>* <a href="#NT-S">S</a>? '&gt;'</code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-AttDef" id="NT-AttDef" />[53] </td><td><code>AttDef</code></td><td> ::= </td><td><code><a href="#NT-S">S</a> <a href="#NT-Name">Name</a> <a href="#NT-S">S</a> <a href="#NT-AttType">AttType</a> <a href="#NT-S">S</a> <a href="#NT-DefaultDecl">DefaultDecl</a></code></td></tr></tbody></table><p>The <a href="#NT-Name">Name</a> in the <a href="#NT-AttlistDecl">AttlistDecl</a> | ||
710 | rule is the type of an element. At user option, an XML processor <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> issue | ||
711 | a warning if attributes are declared for an element type not itself declared, | ||
712 | but this is not an error. The <a href="#NT-Name">Name</a> in the <a href="#NT-AttDef">AttDef</a> | ||
713 | rule is the name of the attribute.</p><p>When more than one <a href="#NT-AttlistDecl">AttlistDecl</a> is provided | ||
714 | for a given element type, the contents of all those provided are merged. When | ||
715 | more than one definition is provided for the same attribute of a given element | ||
716 | type, the first declaration is binding and later declarations are ignored. <a title="For interoperability" href="#dt-interop">For interoperability,</a> writers of DTDs <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> choose | ||
717 | to provide at most one attribute-list declaration for a given element type, | ||
718 | at most one attribute definition for a given attribute name in an attribute-list | ||
719 | declaration, and at least one attribute definition in each attribute-list | ||
720 | declaration. For interoperability, an XML processor <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> at user option | ||
721 | issue a warning when more than one attribute-list declaration is provided | ||
722 | for a given element type, or more than one attribute definition is provided | ||
723 | for a given attribute, but this is not an error.</p><div class="div3"> <h4><a name="sec-attribute-types" id="sec-attribute-types" />3.3.1 Attribute Types</h4><p>XML attribute types are of three kinds: a string type, a set of tokenized | ||
724 | types, and enumerated types. The string type may take any literal string as | ||
725 | a value; the tokenized types have varying lexical and semantic constraints. | ||
726 | The validity constraints noted in the grammar are applied after the attribute | ||
727 | value has been normalized as described in <span><a href="#AVNormalize"><b>3.3.3 Attribute-Value Normalization</b></a></span>.</p> <h5><a name="IDAPPCU" id="IDAPPCU" />Attribute Types</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-AttType" id="NT-AttType" />[54] </td><td><code>AttType</code></td><td> ::= </td><td><code><a href="#NT-StringType">StringType</a> | <a href="#NT-TokenizedType">TokenizedType</a> | ||
728 | | <a href="#NT-EnumeratedType">EnumeratedType</a></code></td></tr><tr valign="baseline"><td><a name="NT-StringType" id="NT-StringType" />[55] </td><td><code>StringType</code></td><td> ::= </td><td><code>'CDATA'</code></td></tr><tr valign="baseline"><td><a name="NT-TokenizedType" id="NT-TokenizedType" />[56] </td><td><code>TokenizedType</code></td><td> ::= </td><td><code>'ID'</code></td><td><a href="#id">[VC: ID]</a></td></tr><tr valign="baseline"><td /><td /><td /><td /><td><a href="#one-id-per-el">[VC: One ID per Element Type]</a></td></tr><tr valign="baseline"><td /><td /><td /><td /><td><a href="#id-default">[VC: ID Attribute Default]</a></td></tr><tr valign="baseline"><td /><td /><td /><td><code>| 'IDREF'</code></td><td><a href="#idref">[VC: IDREF]</a></td></tr><tr valign="baseline"><td /><td /><td /><td><code>| 'IDREFS'</code></td><td><a href="#idref">[VC: IDREF]</a></td></tr><tr valign="baseline"><td /><td /><td /><td><code>| 'ENTITY'</code></td><td><a href="#entname">[VC: Entity Name]</a></td></tr><tr valign="baseline"><td /><td /><td /><td><code>| 'ENTITIES'</code></td><td><a href="#entname">[VC: Entity Name]</a></td></tr><tr valign="baseline"><td /><td /><td /><td><code>| 'NMTOKEN'</code></td><td><a href="#nmtok">[VC: Name Token]</a></td></tr><tr valign="baseline"><td /><td /><td /><td><code>| 'NMTOKENS'</code></td><td><a href="#nmtok">[VC: Name Token]</a></td></tr></tbody></table><div class="constraint"><p class="prefix"><a name="id" id="id" /><b>Validity constraint: ID</b></p><p>Values of type <b>ID</b> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match the <a href="#NT-Name">Name</a> production. A name <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> appear more than once | ||
729 | in an XML document as a value of this type; i.e., ID values <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> uniquely | ||
730 | identify the elements which bear them.</p></div><div class="constraint"><p class="prefix"><a name="one-id-per-el" id="one-id-per-el" /><b>Validity constraint: One ID per Element Type</b></p><p><span class="mustard">An element | ||
731 | type <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em></span> have more than one ID attribute specified.</p></div><div class="constraint"><p class="prefix"><a name="id-default" id="id-default" /><b>Validity constraint: ID Attribute Default</b></p><p>An ID attribute | ||
732 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> have a declared default of <b>#IMPLIED</b> or <b>#REQUIRED</b>.</p></div><div class="constraint"><p class="prefix"><a name="idref" id="idref" /><b>Validity constraint: IDREF</b></p><p>Values of type <b>IDREF</b> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> | ||
733 | match the <a href="#NT-Name">Name</a> production, and values of type <b>IDREFS</b> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match <a href="#NT-Names">Names</a>; each <a href="#NT-Name">Name</a> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match the value of an ID attribute on some element in the XML document; | ||
734 | i.e. <b>IDREF</b> values <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match the value of some ID attribute.</p></div><div class="constraint"><p class="prefix"><a name="entname" id="entname" /><b>Validity constraint: Entity Name</b></p><p>Values of type <b>ENTITY</b> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match the <a href="#NT-Name">Name</a> production, values of type <b>ENTITIES</b> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match <a href="#NT-Names">Names</a>; each <a href="#NT-Name">Name</a> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match the name of an <a title="Unparsed Entity" href="#dt-unparsed">unparsed entity</a> | ||
735 | declared in the <a title="Document Type Declaration" href="#dt-doctype">DTD</a>.</p></div><div class="constraint"><p class="prefix"><a name="nmtok" id="nmtok" /><b>Validity constraint: Name Token</b></p><p>Values of type <b>NMTOKEN</b> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match the <a href="#NT-Nmtoken">Nmtoken</a> production; values of type <b>NMTOKENS</b> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match <a href="#NT-Nmtokens">Nmtokens</a>.</p></div><p>[<a name="dt-enumerated" id="dt-enumerated" title="Enumerated Attribute
Values">Definition</a>: <b>Enumerated attributes</b> <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em></span> take one of a list of values | ||
736 | provided in the declaration]. There are two kinds of enumerated types:</p> <h5><a name="IDAHXCU" id="IDAHXCU" />Enumerated Attribute Types</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-EnumeratedType" id="NT-EnumeratedType" />[57] </td><td><code>EnumeratedType</code></td><td> ::= </td><td><code><a href="#NT-NotationType">NotationType</a> | ||
737 | | <a href="#NT-Enumeration">Enumeration</a></code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-NotationType" id="NT-NotationType" />[58] </td><td><code>NotationType</code></td><td> ::= </td><td><code>'NOTATION' <a href="#NT-S">S</a> '(' <a href="#NT-S">S</a>? <a href="#NT-Name">Name</a> (<a href="#NT-S">S</a>? '|' <a href="#NT-S">S</a>? <a href="#NT-Name">Name</a>)* <a href="#NT-S">S</a>? ')' </code></td><td><a href="#notatn">[VC: Notation Attributes]</a></td></tr><tr valign="baseline"><td /><td /><td /><td /><td><a href="#OneNotationPer">[VC: One Notation Per Element Type]</a></td></tr><tr valign="baseline"><td /><td /><td /><td /><td><a href="#NoNotationEmpty">[VC: No Notation on Empty Element]</a></td></tr><tr valign="baseline"><td /><td /><td /><td /><td><a href="#NoDuplicateTokens">[VC: No Duplicate | ||
738 | Tokens]</a></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-Enumeration" id="NT-Enumeration" />[59] </td><td><code>Enumeration</code></td><td> ::= </td><td><code>'(' <a href="#NT-S">S</a>? <a href="#NT-Nmtoken">Nmtoken</a> | ||
739 | (<a href="#NT-S">S</a>? '|' <a href="#NT-S">S</a>? <a href="#NT-Nmtoken">Nmtoken</a>)* <a href="#NT-S">S</a>? ')'</code></td><td><a href="#enum">[VC: Enumeration]</a></td></tr><tr valign="baseline"><td /><td /><td /><td /><td><a href="#NoDuplicateTokens">[VC: No Duplicate | ||
740 | Tokens]</a></td></tr></tbody></table><p>A <b>NOTATION</b> attribute identifies a <a title="Notation" href="#dt-notation">notation</a>, | ||
741 | declared in the DTD with associated system and/or public identifiers, to be | ||
742 | used in interpreting the element to which the attribute is attached.</p><div class="constraint"><p class="prefix"><a name="notatn" id="notatn" /><b>Validity constraint: Notation Attributes</b></p><p>Values of this type | ||
743 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match one of the <a href="#Notations"><cite>notation</cite></a> names | ||
744 | included in the declaration; all notation names in the declaration <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be | ||
745 | declared.</p></div><div class="constraint"><p class="prefix"><a name="OneNotationPer" id="OneNotationPer" /><b>Validity constraint: One Notation Per Element Type</b></p><p><span class="mustard">An element type <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em></span> have more than one <b>NOTATION</b> | ||
746 | attribute specified.</p></div><div class="constraint"><p class="prefix"><a name="NoNotationEmpty" id="NoNotationEmpty" /><b>Validity constraint: No Notation on Empty Element</b></p><p><a title="For Compatibility" href="#dt-compat">For compatibility</a>, | ||
747 | an attribute of type <b>NOTATION</b> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> be declared on an element | ||
748 | declared <b>EMPTY</b>.</p></div><div class="constraint"><p class="prefix"><a name="NoDuplicateTokens" id="NoDuplicateTokens" /><b>Validity constraint: No Duplicate | ||
749 | Tokens</b></p><p>The notation names in a single <a href="#NT-NotationType">NotationType</a> | ||
750 | attribute declaration, as well as the <a href="#NT-Nmtoken">Nmtoken</a>s in a single | ||
751 | <a href="#NT-Enumeration">Enumeration</a> attribute declaration, <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> all be distinct.</p></div><div class="constraint"><p class="prefix"><a name="enum" id="enum" /><b>Validity constraint: Enumeration</b></p><p>Values of this type <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match | ||
752 | one of the <a href="#NT-Nmtoken">Nmtoken</a> tokens in the declaration.</p></div><p><a title="For interoperability" href="#dt-interop">For interoperability,</a> the same <a href="#NT-Nmtoken">Nmtoken</a> <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD NOT</em> occur more than once in the enumerated | ||
753 | attribute types of a single element type.</p></div><div class="div3"> <h4><a name="sec-attr-defaults" id="sec-attr-defaults" />3.3.2 Attribute Defaults</h4><p>An <a title="Attribute-List Declaration" href="#dt-attdecl">attribute declaration</a> provides information | ||
754 | on whether the attribute's presence is <em class="rfc2119" title="Keyword in RFC 2119 context">REQUIRED</em>, and if not, how an XML processor | ||
755 | <span>is | ||
756 | to</span> react if a declared attribute is absent in a document.</p> <h5><a name="IDAR4CU" id="IDAR4CU" />Attribute Defaults</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-DefaultDecl" id="NT-DefaultDecl" />[60] </td><td><code>DefaultDecl</code></td><td> ::= </td><td><code>'#REQUIRED' | '#IMPLIED' </code></td></tr><tr valign="baseline"><td /><td /><td /><td><code>| (('#FIXED' S)? <a href="#NT-AttValue">AttValue</a>)</code></td><td><a href="#RequiredAttr">[VC: Required Attribute]</a></td></tr><tr valign="baseline"><td /><td /><td /><td /><td><a href="#defattrvalid">[VC: Attribute | ||
757 | Default Value Syntactically Correct]</a></td></tr><tr valign="baseline"><td /><td /><td /><td /><td><a href="#CleanAttrVals">[WFC: No &lt; in Attribute Values]</a></td></tr><tr valign="baseline"><td /><td /><td /><td /><td><a href="#FixedAttr">[VC: Fixed Attribute Default]</a></td></tr></tbody></table><p>In an attribute declaration, <b>#REQUIRED</b> means that the attribute | ||
758 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> always be provided, <b>#IMPLIED</b> that no default value is provided. | ||
759 | [<a name="dt-default" id="dt-default" title="Attribute Default">Definition</a>: If | ||
760 | the declaration is neither <b>#REQUIRED</b> nor <b>#IMPLIED</b>, then | ||
761 | the <a href="#NT-AttValue">AttValue</a> value contains the declared <b>default</b> | ||
762 | value; the <b>#FIXED</b> keyword states that the attribute <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> always have | ||
763 | the default value. | ||
764 | When an XML processor encounters | ||
765 | an <span>element | ||
766 | without a specification for an attribute for which it has read a default | ||
767 | value declaration, it <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> report the attribute with the declared default | ||
768 | value to the application</span>.]</p><div class="constraint"><p class="prefix"><a name="RequiredAttr" id="RequiredAttr" /><b>Validity constraint: Required Attribute</b></p><p>If the default | ||
769 | declaration is the keyword <b>#REQUIRED</b>, then the attribute <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be | ||
770 | specified for all elements of the type in the attribute-list declaration.</p></div><div class="constraint"><p class="prefix"><a name="defattrvalid" id="defattrvalid" /><b>Validity constraint: <span>Attribute | ||
771 | Default Value Syntactically Correct</span></b></p><p>The declared default value <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> meet the <span>syntactic</span> | ||
772 | constraints of the declared attribute type.</p><p>Note that only the | ||
773 | syntactic constraints of the type are required here; other constraints (e.g. | ||
774 | that the value be the name of a declared unparsed entity, for an attribute of | ||
775 | type ENTITY) may come into play if the declared default value is actually used | ||
776 | (an element without a specification for this attribute occurs).</p></div><div class="constraint"><p class="prefix"><a name="FixedAttr" id="FixedAttr" /><b>Validity constraint: Fixed Attribute Default</b></p><p>If an attribute | ||
777 | has a default value declared with the <b>#FIXED</b> keyword, instances of | ||
778 | that attribute <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match the default value.</p></div><p>Examples of attribute-list declarations:</p><div class="exampleInner"><pre>&lt;!ATTLIST termdef | ||
779 | id ID #REQUIRED | ||
780 | name CDATA #IMPLIED&gt; | ||
781 | &lt;!ATTLIST list | ||
782 | type (bullets|ordered|glossary) "ordered"&gt; | ||
783 | &lt;!ATTLIST form | ||
784 | method CDATA #FIXED "POST"&gt;</pre></div></div><div class="div3"> <h4><a name="AVNormalize" id="AVNormalize" />3.3.3 Attribute-Value Normalization</h4><p>Before the value of an attribute is passed to the application or checked | ||
785 | for validity, the XML processor <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> normalize the attribute value by applying | ||
786 | the algorithm below, or by using some other method such that the value passed | ||
787 | to the application is the same as that produced by the algorithm.</p><ol type="1"><li><p>All line breaks <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> have been normalized on input to #xA as described | ||
788 | in <a href="#sec-line-ends"><b>2.11 End-of-Line Handling</b></a>, so the rest of this algorithm operates | ||
789 | on text normalized in this way.</p></li><li><p>Begin with a normalized value consisting of the empty string.</p></li><li><p>For each character, entity reference, or character reference in the | ||
790 | unnormalized attribute value, beginning with the first and continuing to the | ||
791 | last, do the following:</p><ul><li><p>For a character reference, append the referenced character to the | ||
792 | normalized value.</p></li><li><p>For an entity reference, recursively apply step 3 of this algorithm | ||
793 | to the replacement text of the entity.</p></li><li><p>For a white space character (#x20, #xD, #xA, #x9), append a space | ||
794 | character (#x20) to the normalized value.</p></li><li><p>For another character, append the character to the normalized value.</p></li></ul></li></ol><p>If the attribute type is not CDATA, then the XML processor <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> further | ||
795 | process the normalized attribute value by discarding any leading and trailing | ||
796 | space (#x20) characters, and by replacing sequences of space (#x20) characters | ||
797 | by a single space (#x20) character.</p><p>Note that if the unnormalized attribute value contains a character reference | ||
798 | to a white space character other than space (#x20), the normalized value contains | ||
799 | the referenced character itself (#xD, #xA or #x9). This contrasts with the | ||
800 | case where the unnormalized value contains a white space character (not a | ||
801 | reference), which is replaced with a space character (#x20) in the normalized | ||
802 | value and also contrasts with the case where the unnormalized value contains | ||
803 | an entity reference whose replacement text contains a white space character; | ||
804 | being recursively processed, the white space character is replaced with a | ||
805 | space character (#x20) in the normalized value.</p><p>All attributes for which no declaration has been read <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> be treated | ||
806 | by a non-validating processor as if declared <b>CDATA</b>.</p><p>It | ||
807 | is an error if an | ||
808 | <span><a title="Attribute Value" href="#dt-attrval">attribute | ||
809 | value</a> contains a <a title="Entity Reference" href="#dt-entref">reference</a> to an | ||
810 | entity for which no declaration has been read.</span></p><p>Following are examples of attribute normalization. Given the following | ||
811 | declarations:</p><div class="exampleInner"><pre><!ENTITY d "&#xD;"> | ||
812 | <!ENTITY a "&#xA;"> | ||
813 | <!ENTITY da "&#xD;&#xA;"></pre></div><p>the attribute specifications in the left column below would be normalized | ||
814 | to the character sequences of the middle column if the attribute <code>a</code> | ||
815 | is declared <b>NMTOKENS</b> and to those of the right column if <code>a</code> | ||
816 | is declared <b>CDATA</b>.</p><table border="1" frame="border" summary="Attribute normalization summary"><thead><tr><th rowspan="1" colspan="1">Attribute specification</th><th rowspan="1" colspan="1">a is NMTOKENS</th><th rowspan="1" colspan="1">a is CDATA</th></tr></thead><tbody><tr><td rowspan="1" colspan="1"><div class="exampleInner"><pre>a=" | ||
817 | xyz"</pre></div></td><td rowspan="1" colspan="1"><div class="exampleInner"><pre>x y z</pre></div></td><td rowspan="1" colspan="1"><div class="exampleInner"><pre>#x20 #x20 x y z</pre></div></td></tr><tr><td rowspan="1" colspan="1"><div class="exampleInner"><pre>a="&d;&d;A&a;<span>&#x20;</span>&a;B&da;"</pre></div></td><td rowspan="1" colspan="1"><div class="exampleInner"><pre>A #x20 B</pre></div></td><td rowspan="1" colspan="1"><div class="exampleInner"><pre>#x20 #x20 A #x20 <span>#x20</span> #x20 B #x20 #x20</pre></div></td></tr><tr><td rowspan="1" colspan="1"><div class="exampleInner"><pre>a= | ||
818 | "&#xd;&#xd;A&#xa;&#xa;B&#xd;&#xa;"</pre></div></td><td rowspan="1" colspan="1"><div class="exampleInner"><pre>#xD #xD A #xA #xA B #xD #xA</pre></div></td><td rowspan="1" colspan="1"><div class="exampleInner"><pre>#xD #xD A #xA #xA B #xD #xA</pre></div></td></tr></tbody></table><p>Note that the last example is invalid (but well-formed) if <code>a</code> | ||
819 | is declared to be of type <b>NMTOKENS</b>.</p></div></div><div class="div2"> <h3><a name="sec-condition-sect" id="sec-condition-sect" />3.4 Conditional Sections</h3><p>[<a name="dt-cond-section" id="dt-cond-section" title="conditional section">Definition</a>: <b>Conditional | ||
820 | sections</b> are portions of the <a title="Document Type Declaration" href="#dt-doctype">document type | ||
821 | declaration external subset</a> <span>or | ||
822 | of external parameter entities </span>which are included in, or excluded from, | ||
823 | the logical structure of the DTD based on the keyword which governs them.]</p> <h5><a name="IDAMHDU" id="IDAMHDU" />Conditional Section</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-conditionalSect" id="NT-conditionalSect" />[61] </td><td><code>conditionalSect</code></td><td> ::= </td><td><code><a href="#NT-includeSect">includeSect</a> | <a href="#NT-ignoreSect">ignoreSect</a></code></td></tr><tr valign="baseline"><td><a name="NT-includeSect" id="NT-includeSect" />[62] </td><td><code>includeSect</code></td><td> ::= </td><td><code>'<![' S? 'INCLUDE' S? '[' <a href="#NT-extSubsetDecl">extSubsetDecl</a> | ||
824 | ']]>' </code></td><td><a href="#condsec-nesting">[VC: Proper Conditional Section/PE Nesting]</a></td></tr><tr valign="baseline"><td><a name="NT-ignoreSect" id="NT-ignoreSect" />[63] </td><td><code>ignoreSect</code></td><td> ::= </td><td><code>'<![' S? 'IGNORE' S? '[' <a href="#NT-ignoreSectContents">ignoreSectContents</a>* | ||
825 | ']]>'</code></td><td><a href="#condsec-nesting">[VC: Proper Conditional Section/PE Nesting]</a></td></tr><tr valign="baseline"><td><a name="NT-ignoreSectContents" id="NT-ignoreSectContents" />[64] </td><td><code>ignoreSectContents</code></td><td> ::= </td><td><code><a href="#NT-Ignore">Ignore</a> ('<![' <a href="#NT-ignoreSectContents">ignoreSectContents</a> ']]>' <a href="#NT-Ignore">Ignore</a>)*</code></td></tr><tr valign="baseline"><td><a name="NT-Ignore" id="NT-Ignore" />[65] </td><td><code>Ignore</code></td><td> ::= </td><td><code><a href="#NT-Char">Char</a>* - (<a href="#NT-Char">Char</a>* | ||
826 | ('<![' | ']]>') <a href="#NT-Char">Char</a>*) </code></td></tr></tbody></table><div class="constraint"><p class="prefix"><a name="condsec-nesting" id="condsec-nesting" /><b>Validity constraint: Proper Conditional Section/PE Nesting</b></p><p>If any of the "<code><![</code>", | ||
827 | "<code>[</code>", or "<code>]]></code>" of a conditional section is contained | ||
828 | in the replacement text for a parameter-entity reference, all of them <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> | ||
829 | be contained in the same replacement text.</p></div><p>Like the internal and external DTD subsets, a conditional section may contain | ||
830 | one or more complete declarations, comments, processing instructions, or nested | ||
831 | conditional sections, intermingled with white space.</p><p>If the keyword of the conditional section is <b>INCLUDE</b>, then the | ||
832 | contents of the conditional section <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be considered</span> part of the DTD. If the keyword of | ||
833 | the conditional section is <b>IGNORE</b>, then the contents of the conditional | ||
834 | section <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be considered as</span> not logically part of the DTD. | ||
835 | If a conditional section with a keyword of <b>INCLUDE</b> occurs within | ||
836 | a larger conditional section with a keyword of <b>IGNORE</b>, both the outer | ||
837 | and the inner conditional sections <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be</span> ignored. The contents | ||
838 | of an ignored conditional section <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be</span> parsed by ignoring all characters after | ||
839 | the "<code>[</code>" following the keyword, except conditional section starts | ||
840 | "<code><![</code>" and ends "<code>]]></code>", until the matching conditional | ||
841 | section end is found. Parameter entity references <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> be</span> recognized in this | ||
842 | process.</p><p>If the keyword of the conditional section is a parameter-entity reference, | ||
843 | the parameter entity <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be replaced by its content before the processor | ||
844 | decides whether to include or ignore the conditional section.</p><p>An example:</p><div class="exampleInner"><pre><!ENTITY % draft 'INCLUDE' > | ||
845 | <!ENTITY % final 'IGNORE' > | ||
846 | <![%draft;[ | ||
847 | <!ELEMENT book (comments*, title, body, supplements?)> | ||
848 | ]]> | ||
849 | <![%final;[ | ||
850 | <!ELEMENT book (title, body, supplements?)> | ||
851 | ]]></pre></div></div></div><div class="div1"> <h2><a name="sec-physical-struct" id="sec-physical-struct" />4 Physical Structures</h2><p>[<a name="dt-entity" id="dt-entity" title="Entity">Definition</a>: An XML document may consist of one | ||
852 | or many storage units. These | ||
853 | are called <b>entities</b>; they all have <b>content</b> and are | ||
854 | all (except for the <a title="Document Entity" href="#dt-docent">document entity</a> and | ||
855 | the <a title="Document Type Declaration" href="#dt-doctype">external DTD subset</a>) identified by | ||
856 | entity <b>name</b>.] Each XML document has one entity | ||
857 | called the <a title="Document Entity" href="#dt-docent">document entity</a>, which serves | ||
858 | as the starting point for the <a title="XML Processor" href="#dt-xml-proc">XML processor</a> | ||
859 | and may contain the whole document.</p><p>Entities may be either parsed or unparsed. [<a name="dt-parsedent" id="dt-parsedent" title="Text Entity">Definition</a>: The contents of a <b>parsed | ||
860 | entity</b> are referred to as its <a title="Replacement Text" href="#dt-repltext">replacement | ||
861 | text</a>; this <a title="Text" href="#dt-text">text</a> is considered an | ||
862 | integral part of the document.]</p><p>[<a name="dt-unparsed" id="dt-unparsed" title="Unparsed Entity">Definition</a>: An <b>unparsed entity</b> | ||
863 | is a resource whose contents may or may not be <a title="Text" href="#dt-text">text</a>, | ||
864 | and if text, may | ||
865 | be other than XML. Each unparsed entity has an associated <a title="Notation" href="#dt-notation">notation</a>, identified by name. Beyond a requirement | ||
866 | that an XML processor make the identifiers for the entity and notation available | ||
867 | to the application, XML places no constraints on the contents of unparsed | ||
868 | entities.]</p><p>Parsed entities are invoked by name using entity references; unparsed entities | ||
869 | by name, given in the value of <b>ENTITY</b> or <b>ENTITIES</b> attributes.</p><p>[<a name="gen-entity" id="gen-entity" title="general entity">Definition</a>: <b>General entities</b> | ||
870 | are entities for use within the document content. In this specification, general | ||
871 | entities are sometimes referred to with the unqualified term <em>entity</em> | ||
872 | when this leads to no ambiguity.] [<a name="dt-PE" id="dt-PE" title="Parameter entity">Definition</a>: <b>Parameter | ||
873 | entities</b> are parsed entities for use within the DTD.] | ||
874 | These two types of entities use different forms of reference and are recognized | ||
875 | in different contexts. Furthermore, they occupy different namespaces; a parameter | ||
876 | entity and a general entity with the same name are two distinct entities.</p><div class="div2"> <h3><a name="sec-references" id="sec-references" />4.1 Character and Entity References</h3><p>[<a name="dt-charref" id="dt-charref" title="Character Reference">Definition</a>: A <b>character | ||
877 | reference</b> refers to a specific character in the ISO/IEC 10646 character | ||
878 | set, for example one not directly accessible from available input devices.]</p> <h5><a name="IDAFYDU" id="IDAFYDU" />Character Reference</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-CharRef" id="NT-CharRef" />[66] </td><td><code>CharRef</code></td><td> ::= </td><td><code>'&#' [0-9]+ ';' </code></td></tr><tr valign="baseline"><td /><td /><td /><td><code>| '&#x' [0-9a-fA-F]+ ';'</code></td><td><a href="#wf-Legalchar">[WFC: Legal Character]</a></td></tr></tbody></table><div class="constraint"><p class="prefix"><a name="wf-Legalchar" id="wf-Legalchar" /><b>Well-formedness constraint: Legal Character</b></p><p>Characters referred | ||
879 | to using character references <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match the production for <a href="#NT-Char">Char</a>.</p></div><p>If the character reference begins with "<code>&#x</code>", | ||
880 | the digits and letters up to the terminating <code>;</code> provide a hexadecimal | ||
881 | representation of the character's code point in ISO/IEC 10646. If it begins | ||
882 | just with "<code>&#</code>", the digits up to the terminating <code>;</code> | ||
883 | provide a decimal representation of the character's code point.</p><p>[<a name="dt-entref" id="dt-entref" title="Entity Reference">Definition</a>: An <b>entity reference</b> | ||
884 | refers to the content of a named entity.] [<a name="dt-GERef" id="dt-GERef" title="General Entity Reference">Definition</a>: References to parsed general entities use | ||
885 | ampersand (<code>&</code>) and semicolon (<code>;</code>) as delimiters.] [<a name="dt-PERef" id="dt-PERef" title="Parameter-entity reference">Definition</a>: <b>Parameter-entity references</b> | ||
886 | use percent-sign (<code>%</code>) and semicolon (<code>;</code>) as delimiters.]</p> <h5><a name="IDAS0DU" id="IDAS0DU" />Entity Reference</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-Reference" id="NT-Reference" />[67] </td><td><code>Reference</code></td><td> ::= </td><td><code><a href="#NT-EntityRef">EntityRef</a> | <a href="#NT-CharRef">CharRef</a></code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-EntityRef" id="NT-EntityRef" />[68] </td><td><code>EntityRef</code></td><td> ::= </td><td><code>'&' <a href="#NT-Name">Name</a> ';'</code></td><td><a href="#wf-entdeclared">[WFC: Entity Declared]</a></td></tr><tr valign="baseline"><td /><td /><td /><td /><td><a href="#vc-entdeclared">[VC: Entity Declared]</a></td></tr><tr valign="baseline"><td /><td /><td /><td /><td><a href="#textent">[WFC: Parsed Entity]</a></td></tr><tr valign="baseline"><td /><td /><td /><td /><td><a href="#norecursion">[WFC: No Recursion]</a></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-PEReference" id="NT-PEReference" />[69] </td><td><code>PEReference</code></td><td> ::= </td><td><code>'%' <a href="#NT-Name">Name</a> ';'</code></td><td><a href="#vc-entdeclared">[VC: Entity Declared]</a></td></tr><tr valign="baseline"><td /><td /><td /><td /><td><a href="#norecursion">[WFC: No Recursion]</a></td></tr><tr valign="baseline"><td /><td /><td /><td /><td><a href="#indtd">[WFC: In DTD]</a></td></tr></tbody></table><div class="constraint"><p class="prefix"><a name="wf-entdeclared" id="wf-entdeclared" /><b>Well-formedness constraint: Entity Declared</b></p><p>In a document | ||
887 | without any DTD, a document with only an internal DTD subset which contains | ||
888 | no parameter entity references, or a document with "<code>standalone='yes'</code>", for | ||
889 | an entity reference that does not occur within the external subset or a parameter | ||
890 | entity, the <a href="#NT-Name">Name</a> given in the entity reference <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> <a title="match" href="#dt-match">match</a> that in an <a href="#sec-entity-decl"><cite>entity | ||
891 | declaration</cite></a> that does not occur within the external subset or a | ||
892 | parameter entity, except that well-formed documents need not declare | ||
893 | any of the following entities: <code>amp</code>, | ||
894 | <code>lt</code>, | ||
895 | <code>gt</code>, | ||
896 | <code>apos</code>, | ||
897 | <code>quot</code>. The | ||
898 | declaration of a general entity <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> precede any reference to it which appears | ||
899 | in a default value in an attribute-list declaration.</p><p><span>Note | ||
900 | that non-validating processors are <a href="#include-if-valid"><cite>not | ||
901 | obligated to</cite></a> read and process entity declarations occurring in parameter entities or in | ||
902 | the external subset</span>; for such documents, | ||
903 | the rule that an entity must be declared is a well-formedness constraint only | ||
904 | if <a href="#sec-rmd"><cite>standalone='yes'</cite></a>.</p></div><div class="constraint"><p class="prefix"><a name="vc-entdeclared" id="vc-entdeclared" /><b>Validity constraint: Entity Declared</b></p><p>In a document with | ||
905 | an external subset or external parameter entities with "<code>standalone='no'</code>", | ||
906 | the <a href="#NT-Name">Name</a> given in the entity reference <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> <a title="match" href="#dt-match">match</a> that in an <a href="#sec-entity-decl"><cite>entity | ||
907 | declaration</cite></a>. For interoperability, valid documents <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> declare | ||
908 | the entities <code>amp</code>, | ||
909 | <code>lt</code>, | ||
910 | <code>gt</code>, | ||
911 | <code>apos</code>, | ||
912 | <code>quot</code>, in the form specified in <a href="#sec-predefined-ent"><b>4.6 Predefined Entities</b></a>. | ||
913 | The declaration of a parameter entity <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> precede any reference to it. Similarly, | ||
914 | the declaration of a general entity <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> precede any attribute-list | ||
915 | declaration containing a default value with a direct or indirect reference | ||
916 | to that general entity.</p></div><div class="constraint"><p class="prefix"><a name="textent" id="textent" /><b>Well-formedness constraint: Parsed Entity</b></p><p>An entity reference <em class="rfc2119" title="Keyword in RFC 2119 context">MUST | ||
917 | NOT</em> contain the name of an <a title="Unparsed Entity" href="#dt-unparsed">unparsed entity</a>. | ||
918 | Unparsed entities may be referred to only in <a title="Attribute Value" href="#dt-attrval">attribute | ||
919 | values</a> declared to be of type <b>ENTITY</b> or <b>ENTITIES</b>.</p></div><div class="constraint"><p class="prefix"><a name="norecursion" id="norecursion" /><b>Well-formedness constraint: No Recursion</b></p><p>A parsed entity <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> contain a recursive reference to itself, either directly or indirectly.</p></div><div class="constraint"><p class="prefix"><a name="indtd" id="indtd" /><b>Well-formedness constraint: In DTD</b></p><p>Parameter-entity references <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> appear outside</span> | ||
920 | the <a title="Document Type Declaration" href="#dt-doctype">DTD</a>.</p></div><p>Examples of character and entity references:</p><div class="exampleInner"><pre>Type <key>less-than</key> (&#x3C;) to save options. | ||
921 | This document was prepared on &docdate; and | ||
922 | is classified &security-level;.</pre></div><p>Example of a parameter-entity reference:</p><div class="exampleInner"><pre><!-- declare the parameter entity "ISOLat2"... --> | ||
923 | <!ENTITY % ISOLat2 | ||
924 | SYSTEM "http://www.xml.com/iso/isolat2-xml.entities" > | ||
925 | <!-- ... now reference it. --> | ||
926 | %ISOLat2;</pre></div></div><div class="div2"> <h3><a name="sec-entity-decl" id="sec-entity-decl" />4.2 Entity Declarations</h3><p>[<a name="dt-entdecl" id="dt-entdecl" title="entity declaration">Definition</a>: Entities are declared | ||
927 | thus:]</p> <h5><a name="IDAECEU" id="IDAECEU" />Entity Declaration</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-EntityDecl" id="NT-EntityDecl" />[70] </td><td><code>EntityDecl</code></td><td> ::= </td><td><code><a href="#NT-GEDecl">GEDecl</a> | <a href="#NT-PEDecl">PEDecl</a></code></td></tr><tr valign="baseline"><td><a name="NT-GEDecl" id="NT-GEDecl" />[71] </td><td><code>GEDecl</code></td><td> ::= </td><td><code>'<!ENTITY' <a href="#NT-S">S</a> <a href="#NT-Name">Name</a> <a href="#NT-S">S</a> <a href="#NT-EntityDef">EntityDef</a> <a href="#NT-S">S</a>? | ||
928 | '>'</code></td></tr><tr valign="baseline"><td><a name="NT-PEDecl" id="NT-PEDecl" />[72] </td><td><code>PEDecl</code></td><td> ::= </td><td><code>'<!ENTITY' <a href="#NT-S">S</a> '%' <a href="#NT-S">S</a> <a href="#NT-Name">Name</a> <a href="#NT-S">S</a> <a href="#NT-PEDef">PEDef</a> <a href="#NT-S">S</a>? '>'</code></td></tr><tr valign="baseline"><td><a name="NT-EntityDef" id="NT-EntityDef" />[73] </td><td><code>EntityDef</code></td><td> ::= </td><td><code><a href="#NT-EntityValue">EntityValue</a>| (<a href="#NT-ExternalID">ExternalID</a> <a href="#NT-NDataDecl">NDataDecl</a>?)</code></td></tr><tr valign="baseline"><td><a name="NT-PEDef" id="NT-PEDef" />[74] </td><td><code>PEDef</code></td><td> ::= </td><td><code><a href="#NT-EntityValue">EntityValue</a> | <a href="#NT-ExternalID">ExternalID</a></code></td></tr></tbody></table><p>The <a href="#NT-Name">Name</a> identifies the entity in an <a title="Entity Reference" href="#dt-entref">entity | ||
929 | reference</a> or, in the case of an unparsed entity, in the value of | ||
930 | an <b>ENTITY</b> or <b>ENTITIES</b> attribute. If the same entity is declared | ||
931 | more than once, the first declaration encountered is binding; at user option, | ||
932 | an XML processor <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> issue a warning if entities are declared multiple times.</p><div class="div3"> <h4><a name="sec-internal-ent" id="sec-internal-ent" />4.2.1 Internal Entities</h4><p>[<a name="dt-internent" id="dt-internent" title="Internal Entity Replacement Text">Definition</a>: If the | ||
933 | entity definition is an <a href="#NT-EntityValue">EntityValue</a>, the defined | ||
934 | entity is called an <b>internal entity</b>. There is no separate physical | ||
935 | storage object, and the content of the entity is given in the declaration.] | ||
936 | Note that some processing of entity and character references in the <a title="Literal Entity Value" href="#dt-litentval">literal entity value</a> may be required to produce | ||
937 | the correct <a title="Replacement Text" href="#dt-repltext">replacement text</a>: see <a href="#intern-replacement"><b>4.5 Construction of Entity Replacement Text</b></a>.</p><p>An internal entity is a <a title="Text Entity" href="#dt-parsedent">parsed entity</a>.</p><p>Example of an internal entity declaration:</p><div class="exampleInner"><pre><!ENTITY Pub-Status "This is a pre-release of the | ||
938 | specification."></pre></div></div><div class="div3"> <h4><a name="sec-external-ent" id="sec-external-ent" />4.2.2 External Entities</h4><p>[<a name="dt-extent" id="dt-extent" title="External Entity">Definition</a>: If the entity is not internal, | ||
939 | it is an <b>external entity</b>, declared as follows:]</p> <h5><a name="IDAUIEU" id="IDAUIEU" />External Entity Declaration</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-ExternalID" id="NT-ExternalID" />[75] </td><td><code>ExternalID</code></td><td> ::= </td><td><code>'SYSTEM' <a href="#NT-S">S</a> <a href="#NT-SystemLiteral">SystemLiteral</a></code></td></tr><tr valign="baseline"><td /><td /><td /><td><code>| 'PUBLIC' <a href="#NT-S">S</a> <a href="#NT-PubidLiteral">PubidLiteral</a> <a href="#NT-S">S</a> <a href="#NT-SystemLiteral">SystemLiteral</a></code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-NDataDecl" id="NT-NDataDecl" />[76] </td><td><code>NDataDecl</code></td><td> ::= </td><td><code><a href="#NT-S">S</a> 'NDATA' <a href="#NT-S">S</a> <a href="#NT-Name">Name</a></code></td><td><a href="#not-declared">[VC: Notation Declared]</a></td></tr></tbody></table><p>If the <a href="#NT-NDataDecl">NDataDecl</a> is present, this is a general <a title="Unparsed Entity" href="#dt-unparsed">unparsed entity</a>; otherwise it is a parsed entity.</p><div class="constraint"><p class="prefix"><a name="not-declared" id="not-declared" /><b>Validity constraint: Notation Declared</b></p><p>The <a href="#NT-Name">Name</a> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> match the declared name of a <a title="Notation" href="#dt-notation">notation</a>.</p></div><p>[<a name="dt-sysid" id="dt-sysid" title="System Identifier">Definition</a>: The <a href="#NT-SystemLiteral">SystemLiteral</a> is called the entity's <b>system | ||
940 | identifier</b>. It is <span>meant to be | ||
941 | converted to</span> a URI reference | ||
942 | (as defined in <a href="#rfc2396">[IETF RFC 2396]</a>, updated by <a href="#rfc2732">[IETF RFC 2732]</a>), | ||
943 | <span>as part of the | ||
944 | process of dereferencing it</span> to obtain input for the XML processor to construct the | ||
945 | entity's replacement text.] It is an error for a fragment identifier | ||
946 | (beginning with a <code>#</code> character) to be part of a system identifier. | ||
947 | Unless otherwise provided by information outside the scope of this specification | ||
948 | (e.g. a special XML element type defined by a particular DTD, or a processing | ||
949 | instruction defined by a particular application specification), relative URIs | ||
950 | are relative to the location of the resource within which the entity declaration | ||
951 | occurs. <span>This is defined to | ||
952 | be the external entity containing the '<' which starts the declaration, at the | ||
953 | point when it is parsed as a declaration.</span> | ||
954 | A URI might thus be relative to the <a title="Document Entity" href="#dt-docent">document | ||
955 | entity</a>, to the entity containing the <a title="Document Type Declaration" href="#dt-doctype">external | ||
956 | DTD subset</a>, or to some other <a title="External Entity" href="#dt-extent">external parameter | ||
957 | entity</a>. <span>Attempts to | ||
958 | retrieve the resource identified by a URI <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> be redirected at the parser | ||
959 | level (for example, in an entity resolver) or below (at the protocol level, | ||
960 | for example, via an HTTP <code>Location:</code> header). In the absence of additional | ||
961 | information outside the scope of this specification within the resource, | ||
962 | the base URI of a resource is always the URI of the actual resource returned. | ||
963 | In other words, it is the URI of the resource retrieved after all redirection | ||
964 | has occurred.</span></p><p>System | ||
965 | identifiers (and other XML strings meant to be used as URI references) <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> contain | ||
966 | characters that, according to <a href="#rfc2396">[IETF RFC 2396]</a> and <a href="#rfc2732">[IETF RFC 2732]</a>, | ||
967 | must be escaped before a URI can be used to retrieve the referenced resource. The | ||
968 | characters to be escaped are the control characters #x0 to #x1F and #x7F (most of | ||
969 | which cannot appear in XML), space #x20, the delimiters '<' #x3C, '>' #x3E and | ||
970 | '"' #x22, the <em>unwise</em> characters '{' #x7B, '}' #x7D, '|' #x7C, '\' #x5C, '^' #x5E and | ||
971 | '`' #x60, as well as all characters above #x7F. Since escaping is not always a fully | ||
972 | reversible process, it <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be performed only when absolutely necessary and as late | ||
973 | as possible in a processing chain. In particular, neither the process of converting | ||
974 | a relative URI to an absolute one nor the process of passing a URI reference to a | ||
975 | process or software component responsible for dereferencing it <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> trigger escaping. | ||
976 | When escaping does occur, it <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be performed as follows:</p><ol type="1"><li><p>Each | ||
977 | character <span>to be escaped</span> | ||
978 | is <span>represented in</span> | ||
979 | UTF-8 <span><a href="#Unicode">[Unicode]</a></span> | ||
980 | as one or more bytes.</p></li><li><p><span>The resulting bytes</span> | ||
981 | are escaped with | ||
982 | the URI escaping mechanism (that is, converted to <code>%</code><var>HH</var>, | ||
983 | where HH is the hexadecimal notation of the byte value).</p></li><li><p>The original character is replaced by the resulting character sequence.</p></li></ol><p>[<a name="dt-pubid" id="dt-pubid" title="Public identifier">Definition</a>: In addition to a system | ||
984 | identifier, an external identifier <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> include a <b>public identifier</b>.] | ||
985 | An XML processor attempting to retrieve the entity's content <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> use | ||
986 | <span>any combination of | ||
987 | the public and system identifiers as well as additional information outside the | ||
988 | scope of this specification</span> to try to generate an alternative URI reference. | ||
989 | If the processor is unable to do so, it <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> use the URI | ||
990 | reference specified in the system literal. Before a match is attempted, | ||
991 | all strings of white space in the public identifier <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be normalized to | ||
992 | single space characters (#x20), and leading and trailing white space <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> | ||
993 | be removed.</p><p>Examples of external entity declarations:</p><div class="exampleInner"><pre><!ENTITY open-hatch | ||
994 | SYSTEM "http://www.textuality.com/boilerplate/OpenHatch.xml"> | ||
995 | <!ENTITY open-hatch | ||
996 | PUBLIC "-//Textuality//TEXT Standard open-hatch boilerplate//EN" | ||
997 | "http://www.textuality.com/boilerplate/OpenHatch.xml"> | ||
998 | <!ENTITY hatch-pic | ||
999 | SYSTEM "../grafix/OpenHatch.gif" | ||
1000 | NDATA gif ></pre></div></div></div><div class="div2"> <h3><a name="TextEntities" id="TextEntities" />4.3 Parsed Entities</h3><div class="div3"> <h4><a name="sec-TextDecl" id="sec-TextDecl" />4.3.1 The Text Declaration</h4><p>External parsed entities <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> each begin with a <b>text declaration</b>.</p> <h5><a name="IDAUPEU" id="IDAUPEU" />Text Declaration</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-TextDecl" id="NT-TextDecl" />[77] </td><td><code>TextDecl</code></td><td> ::= </td><td><code>'<?xml' <a href="#NT-VersionInfo">VersionInfo</a>? <a href="#NT-EncodingDecl">EncodingDecl</a> <a href="#NT-S">S</a>? '?>'</code></td></tr></tbody></table><p>The text declaration <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be provided literally, not by reference | ||
1001 | to a parsed entity. <span class="mustard">The</span> text declaration | ||
1002 | <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em></span> appear at any | ||
1003 | position other than the beginning of an external parsed entity. The text declaration | ||
1004 | in an external parsed entity is not considered part of its <a title="Replacement Text" href="#dt-repltext">replacement text</a>.</p></div><div class="div3"> <h4><a name="wf-entities" id="wf-entities" />4.3.2 Well-Formed Parsed Entities</h4><p>The document entity is well-formed if it matches the production labeled <a href="#NT-document">document</a>. An external general parsed entity is well-formed | ||
1005 | if it matches the production labeled <a href="#NT-extParsedEnt">extParsedEnt</a>. All | ||
1006 | external parameter entities are well-formed by definition.</p> <h5><a name="IDA2REU" id="IDA2REU" />Well-Formed External Parsed Entity</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-extParsedEnt" id="NT-extParsedEnt" />[78] </td><td><code>extParsedEnt</code></td><td> ::= </td><td><code><a href="#NT-TextDecl">TextDecl</a>? <a href="#NT-content">content</a> - <a href="#NT-Char">Char</a>* <a href="#NT-RestrictedChar">RestrictedChar</a> <a href="#NT-Char">Char</a>*</code></td></tr></tbody></table><p>An internal general parsed entity is well-formed if its replacement text | ||
1007 | matches the production labeled <a href="#NT-content">content</a>. All internal | ||
1008 | parameter entities are well-formed by definition.</p><p>A consequence of well-formedness in <span>general</span> | ||
1009 | entities is that the logical and physical | ||
1010 | structures in an XML document are properly nested; no <a title="Start-Tag" href="#dt-stag">start-tag</a>, <a title="End Tag" href="#dt-etag">end-tag</a>, <a title="Empty" href="#dt-empty">empty-element tag</a>, <a title="Element" href="#dt-element">element</a>, <a title="Comment" href="#dt-comment">comment</a>, <a title="Processing instruction" href="#dt-pi">processing instruction</a>, <a title="Character Reference" href="#dt-charref">character | ||
1011 | reference</a>, or <a title="Entity Reference" href="#dt-entref">entity reference</a> | ||
1012 | can begin in one entity and end in another.</p></div><div class="div3"> <h4><a name="charencoding" id="charencoding" />4.3.3 Character Encoding in Entities</h4><p>Each external parsed entity in an XML document <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> use a different encoding | ||
1013 | for its characters. All XML processors <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be able to read entities in both | ||
1014 | the UTF-8 and UTF-16 encodings. The terms "UTF-8" | ||
1015 | and "UTF-16" in this specification do not apply to character | ||
1016 | encodings with any other labels, even if the encodings or labels are very | ||
1017 | similar to UTF-8 or UTF-16.</p><p>Entities encoded in UTF-16 <span><em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em></span> <span>and entities | ||
1018 | encoded in UTF-8 <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em></span> begin with the Byte Order Mark described in | ||
1019 | ISO/IEC 10646 <a href="#ISO10646">[ISO/IEC 10646]</a> or Unicode <a href="#Unicode">[Unicode]</a> | ||
1020 | (the ZERO WIDTH NO-BREAK SPACE character, #xFEFF). This is an encoding signature, | ||
1021 | not part of either the markup or the character data of the XML document. XML | ||
1022 | processors <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be able to use this character to differentiate between UTF-8 | ||
1023 | and UTF-16 encoded documents.</p><p>Although an XML processor is required to read only entities in the UTF-8 | ||
1024 | and UTF-16 encodings, it is recognized that other encodings are used around | ||
1025 | the world, and it may be desired for XML processors to read entities that | ||
1026 | use them. In | ||
1027 | the absence of external character encoding information (such as MIME headers), | ||
1028 | parsed entities which are stored in an encoding other than UTF-8 or UTF-16 | ||
1029 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> begin with a text declaration (see <a href="#sec-TextDecl"><b>4.3.1 The Text Declaration</b></a>) containing | ||
1030 | an encoding declaration:</p> <h5><a name="IDARVEU" id="IDARVEU" />Encoding Declaration</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-EncodingDecl" id="NT-EncodingDecl" />[80] </td><td><code>EncodingDecl</code></td><td> ::= </td><td><code><a href="#NT-S">S</a> 'encoding' <a href="#NT-Eq">Eq</a> | ||
1031 | ('"' <a href="#NT-EncName">EncName</a> '"' | "'" <a href="#NT-EncName">EncName</a> | ||
1032 | "'" ) </code></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-EncName" id="NT-EncName" />[81] </td><td><code>EncName</code></td><td> ::= </td><td><code>[A-Za-z] ([A-Za-z0-9._] | '-')*</code></td><td><i>/* Encoding | ||
1033 | name contains only Latin characters */</i></td></tr></tbody></table><p>In the <a title="Document Entity" href="#dt-docent">document entity</a>, the encoding | ||
1034 | declaration is part of the <a title="XML Declaration" href="#dt-xmldecl">XML declaration</a>. | ||
1035 | The <a href="#NT-EncName">EncName</a> is the name of the encoding used.</p><p>In an encoding declaration, the values "<code>UTF-8</code>", "<code>UTF-16</code>", | ||
1036 | "<code>ISO-10646-UCS-2</code>", and "<code>ISO-10646-UCS-4</code>" <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> be used | ||
1037 | for the various encodings and transformations of Unicode / ISO/IEC 10646, | ||
1038 | the values "<code>ISO-8859-1</code>", "<code>ISO-8859-2</code>", | ||
1039 | ... "<code>ISO-8859-</code><var>n</var>" (where <var>n</var> | ||
1040 | is the part number) <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> be used for the parts of ISO 8859, and | ||
1041 | the values "<code>ISO-2022-JP</code>", "<code>Shift_JIS</code>", | ||
1042 | and "<code>EUC-JP</code>" <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> be used for the various encoded | ||
1043 | forms of JIS X-0208-1997. It | ||
1044 | is <em class="rfc2119" title="Keyword in RFC 2119 context">RECOMMENDED</em> that character encodings registered (as <em>charset</em>s) | ||
1045 | with the Internet Assigned Numbers Authority <a href="#IANA">[IANA-CHARSETS]</a>, | ||
1046 | other than those just listed, be referred to using their registered names; | ||
1047 | other encodings <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> use names starting with an "x-" prefix. | ||
1048 | XML processors <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> match character encoding names in a case-insensitive | ||
1049 | way and <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> either interpret an IANA-registered name as the encoding registered | ||
1050 | at IANA for that name or treat it as unknown (processors are, of course, not | ||
1051 | required to support all IANA-registered encodings).</p><p>In the absence of information provided by an external transport protocol | ||
1052 | (e.g. HTTP or MIME), it is a <a title="Fatal Error" href="#dt-fatal">fatal error</a> for | ||
1053 | an entity including an encoding declaration to be presented to the XML processor | ||
1054 | in an encoding other than that named in the declaration, or for an entity which | ||
1055 | begins with neither a Byte Order Mark | ||
1056 | nor an encoding declaration to use an encoding other than UTF-8. Note that | ||
1057 | since ASCII is a subset of UTF-8, ordinary ASCII entities do not strictly | ||
1058 | need an encoding declaration.</p><p>It is a <a title="Fatal Error" href="#dt-fatal">fatal error</a> for a <a href="#NT-TextDecl">TextDecl</a> to occur other | ||
1059 | than at the beginning of an external entity.</p><p>It is a <a title="Fatal Error" href="#dt-fatal">fatal error</a> when an XML processor | ||
1060 | encounters an entity with an encoding that it is unable to process. It | ||
1061 | is a <a title="Fatal Error" href="#dt-fatal">fatal error</a> if an XML entity is determined (via default, encoding declaration, | ||
1062 | or higher-level protocol) to be in a certain encoding but contains <span>byte</span> | ||
1063 | sequences that are not legal in that encoding. <span>Specifically, it is a | ||
1064 | fatal error if an entity encoded in UTF-8 contains any irregular code unit sequences, | ||
1065 | as defined in Unicode <a href="#Unicode">[Unicode]</a>.</span> <span>Unless an encoding | ||
1066 | is determined by a higher-level protocol, </span>it is also a <a title="Fatal Error" href="#dt-fatal">fatal error</a> if an XML entity | ||
1067 | contains no encoding declaration and its content is not legal UTF-8 or UTF-16.</p><p>Examples of text declarations containing encoding declarations:</p><div class="exampleInner"><pre>&lt;?xml encoding='UTF-8'?&gt; | ||
1068 | &lt;?xml encoding='EUC-JP'?&gt;</pre></div></div><div class="div3"> <h4><a name="sec-version-info" id="sec-version-info" />4.3.4 Version Information in Entities</h4><p>Each entity, including the <a title="Document Entity" href="#dt-docent">document entity</a>, | ||
1069 | can be separately | ||
1070 | declared as XML 1.0 or XML 1.1. The version declaration appearing | ||
1071 | in the document entity determines the version of the document as a | ||
1072 | whole. An XML 1.1 document may invoke XML 1.0 external entities, so | ||
1073 | that otherwise duplicated versions of external entities, | ||
1074 | particularly DTD external subsets, need not be maintained. However, | ||
1075 | in such a case the rules of XML 1.1 are applied to the entire | ||
1076 | document.</p><p> If an entity (including the document entity) is not labeled | ||
1077 | with a version number, it is treated as if labeled as version | ||
1078 | 1.0.</p></div></div><div class="div2"> <h3><a name="entproc" id="entproc" />4.4 XML Processor Treatment of Entities and References</h3><p>The table below summarizes the contexts in which character references, | ||
1079 | entity references, and invocations of unparsed entities might appear and the | ||
1080 | <em class="rfc2119" title="Keyword in RFC 2119 context">REQUIRED</em> behavior of an <a title="XML Processor" href="#dt-xml-proc">XML processor</a> | ||
1081 | in each case. The labels in the leftmost column describe the recognition context: </p><dl><dt class="label">Reference in Content</dt><dd><p>as a reference anywhere after the <a title="Start-Tag" href="#dt-stag">start-tag</a> | ||
1082 | and before the <a title="End Tag" href="#dt-etag">end-tag</a> of an element; corresponds | ||
1083 | to the nonterminal <a href="#NT-content">content</a>.</p></dd><dt class="label">Reference in Attribute Value</dt><dd><p>as a reference within either the value of an attribute in a <a title="Start-Tag" href="#dt-stag">start-tag</a>, | ||
1084 | or a default value in an <a title="Attribute-List Declaration" href="#dt-attdecl">attribute declaration</a>; | ||
1085 | corresponds to the nonterminal <a href="#NT-AttValue">AttValue</a>.</p></dd><dt class="label">Occurs as Attribute Value</dt><dd><p>as a <a href="#NT-Name">Name</a>, not a reference, appearing either as | ||
1086 | the value of an attribute which has been declared as type <b>ENTITY</b>, | ||
1087 | or as one of the space-separated tokens in the value of an attribute which | ||
1088 | has been declared as type <b>ENTITIES</b>.</p></dd><dt class="label">Reference in Entity Value</dt><dd><p>as a reference within a parameter or internal entity's <a title="Literal Entity Value" href="#dt-litentval">literal | ||
1089 | entity value</a> in the entity's declaration; corresponds to the nonterminal <a href="#NT-EntityValue">EntityValue</a>.</p></dd><dt class="label">Reference in DTD</dt><dd><p>as a reference within either the internal or external subsets of the <a title="Document Type Declaration" href="#dt-doctype">DTD</a>, but outside of an <a href="#NT-EntityValue">EntityValue</a>, <a href="#NT-AttValue">AttValue</a>, <a href="#NT-PI">PI</a>, <a href="#NT-Comment">Comment</a>, <a href="#NT-SystemLiteral">SystemLiteral</a>, <a href="#NT-PubidLiteral">PubidLiteral</a>, | ||
1090 | or the contents of an ignored conditional section (see <a href="#sec-condition-sect"><b>3.4 Conditional Sections</b></a>).</p></dd></dl><p></p><table border="1" frame="border" cellpadding="7" summary="Entity type/reference matrix"><tbody align="center"><tr><td rowspan="2" colspan="1"></td><td colspan="4" align="center" valign="bottom" rowspan="1">Entity | ||
1091 | Type</td><td rowspan="2" align="center" colspan="1">Character</td></tr><tr align="center" valign="bottom"><td rowspan="1" colspan="1">Parameter</td><td rowspan="1" colspan="1">Internal General</td><td rowspan="1" colspan="1">External Parsed | ||
1092 | General</td><td rowspan="1" colspan="1">Unparsed</td></tr><tr align="center" valign="middle"><td align="right" rowspan="1" colspan="1">Reference | ||
1093 | in Content</td><td rowspan="1" colspan="1"><a href="#not-recognized"><cite>Not recognized</cite></a></td><td rowspan="1" colspan="1"><a href="#included"><cite>Included</cite></a></td><td rowspan="1" colspan="1"><a href="#include-if-valid"><cite>Included | ||
1094 | if validating</cite></a></td><td rowspan="1" colspan="1"><a href="#forbidden"><cite>Forbidden</cite></a></td><td rowspan="1" colspan="1"><a href="#included"><cite>Included</cite></a></td></tr><tr align="center" valign="middle"><td align="right" rowspan="1" colspan="1">Reference in Attribute Value</td><td rowspan="1" colspan="1"><a href="#not-recognized"><cite>Not recognized</cite></a></td><td rowspan="1" colspan="1"><a href="#inliteral"><cite>Included | ||
1095 | in literal</cite></a></td><td rowspan="1" colspan="1"><a href="#forbidden"><cite>Forbidden</cite></a></td><td rowspan="1" colspan="1"><a href="#forbidden"><cite>Forbidden</cite></a></td><td rowspan="1" colspan="1"><a href="#included"><cite>Included</cite></a></td></tr><tr align="center" valign="middle"><td align="right" rowspan="1" colspan="1">Occurs as Attribute | ||
1096 | Value</td><td rowspan="1" colspan="1"><a href="#not-recognized"><cite>Not recognized</cite></a></td><td rowspan="1" colspan="1"><a href="#forbidden"><cite>Forbidden</cite></a></td><td rowspan="1" colspan="1"><a href="#forbidden"><cite>Forbidden</cite></a></td><td rowspan="1" colspan="1"><a href="#notify"><cite>Notify</cite></a></td><td rowspan="1" colspan="1"><a href="#not-recognized"><cite>Not recognized</cite></a></td></tr><tr align="center" valign="middle"><td align="right" rowspan="1" colspan="1">Reference in EntityValue</td><td rowspan="1" colspan="1"><a href="#inliteral"><cite>Included in literal</cite></a></td><td rowspan="1" colspan="1"><a href="#bypass"><cite>Bypassed</cite></a></td><td rowspan="1" colspan="1"><a href="#bypass"><cite>Bypassed</cite></a></td><td rowspan="1" colspan="1"><a href="#error"><cite><span>Error</span></cite></a></td><td rowspan="1" colspan="1"><a href="#included"><cite>Included</cite></a></td></tr><tr align="center" valign="middle"><td align="right" rowspan="1" colspan="1">Reference in DTD</td><td rowspan="1" colspan="1"><a href="#as-PE"><cite>Included as PE</cite></a></td><td rowspan="1" colspan="1"><a href="#forbidden"><cite>Forbidden</cite></a></td><td rowspan="1" colspan="1"><a href="#forbidden"><cite>Forbidden</cite></a></td><td rowspan="1" colspan="1"><a href="#forbidden"><cite>Forbidden</cite></a></td><td rowspan="1" colspan="1"><a href="#forbidden"><cite>Forbidden</cite></a></td></tr></tbody></table><div class="div3"> <h4><a name="not-recognized" id="not-recognized" />4.4.1 Not Recognized</h4><p>Outside the DTD, the <code>%</code> character has no special significance; | ||
1097 | thus, what would be parameter entity references in the DTD are not recognized | ||
1098 | as markup in <a href="#NT-content">content</a>. Similarly, the names of unparsed | ||
1099 | entities are not recognized except when they appear in the value of an appropriately | ||
1100 | declared attribute.</p></div><div class="div3"> <h4><a name="included" id="included" />4.4.2 Included</h4><p>[<a name="dt-include" id="dt-include" title="Include">Definition</a>: An entity is <b>included</b> | ||
1101 | when its <a title="Replacement Text" href="#dt-repltext">replacement text</a> is retrieved | ||
1102 | and processed, in place of the reference itself, as though it were part of | ||
1103 | the document at the location the reference was recognized.] The replacement | ||
1104 | text <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> contain both <a title="Character Data" href="#dt-chardata">character data</a> | ||
1105 | and (except for parameter entities) <a title="Markup" href="#dt-markup">markup</a>, | ||
1106 | which <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be recognized in the usual way. (The string "<code>AT&amp;amp;T;</code>" | ||
1107 | expands to "<code>AT&amp;T;</code>" and the remaining ampersand | ||
1108 | is not recognized as an entity-reference delimiter.) A character reference | ||
1109 | is <b>included</b> when the indicated character is processed in place | ||
1110 | of the reference itself. </p></div><div class="div3"> <h4><a name="include-if-valid" id="include-if-valid" />4.4.3 Included If Validating</h4><p>When an XML processor recognizes a reference to a parsed entity, in order | ||
1111 | to <a title="Validity" href="#dt-valid">validate</a> the document, the processor | ||
1112 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> <a title="Include" href="#dt-include">include</a> its replacement text. If | ||
1113 | the entity is external, and the processor is not attempting to validate the | ||
1114 | XML document, the processor <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em>, but need | ||
1115 | not, include the entity's replacement text. If a non-validating processor | ||
1116 | does not include the replacement text, it <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> inform the application that | ||
1117 | it recognized, but did not read, the entity.</p><p>This rule is based on the recognition that the automatic inclusion provided | ||
1118 | by the SGML and XML entity mechanism, primarily designed to support modularity | ||
1119 | in authoring, is not necessarily appropriate for other applications, in particular | ||
1120 | document browsing. Browsers, for example, when encountering an external parsed | ||
1121 | entity reference, might choose to provide a visual indication of the entity's | ||
1122 | presence and retrieve it for display only on demand.</p></div><div class="div3"> <h4><a name="forbidden" id="forbidden" />4.4.4 Forbidden</h4><p>The following are forbidden, and constitute <a title="Fatal Error" href="#dt-fatal">fatal | ||
1123 | errors</a>:</p><ul><li><p>the appearance of a reference to an <a title="Unparsed Entity" href="#dt-unparsed">unparsed | ||
1124 | entity</a><span>, except in the | ||
1125 | <a href="#NT-EntityValue">EntityValue</a> in an entity declaration</span>.</p></li><li><p>the appearance of any character or general-entity reference in the | ||
1126 | DTD except within an <a href="#NT-EntityValue">EntityValue</a> or <a href="#NT-AttValue">AttValue</a>.</p></li><li><p>a reference to an external entity in an attribute value.</p></li></ul></div><div class="div3"> <h4><a name="inliteral" id="inliteral" />4.4.5 Included in Literal</h4><p>When an <a title="Entity Reference" href="#dt-entref">entity reference</a> appears in | ||
1127 | an attribute value, or a parameter entity reference appears in a literal entity | ||
1128 | value, its <a title="Replacement Text" href="#dt-repltext">replacement text</a> <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be</span> processed | ||
1129 | in place of the reference itself as though it were part of the document at | ||
1130 | the location the reference was recognized, except that a single or double | ||
1131 | quote character in the replacement text <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> always be</span> treated as a normal data | ||
1132 | character and <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em></span> terminate the literal. For example, this is well-formed:</p><div class="exampleInner"><pre>&lt;!ENTITY % YN '"Yes"' &gt; | ||
1133 | &lt;!ENTITY WhatHeSaid "He said %YN;" &gt;</pre></div><p>while this is not:</p><div class="exampleInner"><pre>&lt;!ENTITY EndAttr "27'" &gt; | ||
1134 | &lt;element attribute='a-&amp;EndAttr;&gt;</pre></div></div><div class="div3"> <h4><a name="notify" id="notify" />4.4.6 Notify</h4><p>When the name of an <a title="Unparsed Entity" href="#dt-unparsed">unparsed entity</a> | ||
1135 | appears as a token in the value of an attribute of declared type <b>ENTITY</b> | ||
1136 | or <b>ENTITIES</b>, a validating processor <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> inform the application of | ||
1137 | the <a title="System Identifier" href="#dt-sysid">system</a> and <a title="Public identifier" href="#dt-pubid">public</a> | ||
1138 | (if any) identifiers for both the entity and its associated <a title="Notation" href="#dt-notation">notation</a>.</p></div><div class="div3"> <h4><a name="bypass" id="bypass" />4.4.7 Bypassed</h4><p>When a general entity reference appears in the <a href="#NT-EntityValue">EntityValue</a> | ||
1139 | in an entity declaration, it <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be</span> bypassed and left as is.</p></div><div class="div3"> <h4><a name="as-PE" id="as-PE" />4.4.8 Included as PE</h4><p>Just as with external parsed entities, parameter entities need only be <a href="#include-if-valid"><cite>included if validating</cite></a>. When a parameter-entity | ||
1140 | reference is recognized in the DTD and included, its <a title="Replacement Text" href="#dt-repltext">replacement | ||
1141 | text</a> <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be</span> enlarged by the attachment of one leading and one following | ||
1142 | space (#x20) character; the intent is to constrain the replacement text of | ||
1143 | parameter entities to contain an integral number of grammatical tokens in | ||
1144 | the DTD. This | ||
1145 | behavior <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em></span> apply to parameter entity references within entity values; | ||
1146 | these are described in <a href="#inliteral"><b>4.4.5 Included in Literal</b></a>.</p></div><div class="div3"> <h4><a name="error" id="error" />4.4.9 Error</h4><p>It is an <a title="Error" href="#dt-error">error</a> for a reference to | ||
1147 | an unparsed entity to appear in the <a href="#NT-EntityValue">EntityValue</a> in an | ||
1148 | entity declaration.</p></div></div><div class="div2"> <h3><a name="intern-replacement" id="intern-replacement" />4.5 Construction of Entity Replacement Text</h3><p>In discussing the treatment of entities, it is useful to distinguish | ||
1149 | two forms of the entity's value. | ||
1150 | [<a name="dt-litentval" id="dt-litentval" title="Literal Entity Value">Definition</a>: <span>For an | ||
1151 | internal entity, </span>the <b>literal | ||
1152 | entity value</b> is the quoted string actually present in the entity declaration, | ||
1153 | corresponding to the non-terminal <a href="#NT-EntityValue">EntityValue</a>.] [<a name="dt-extlitentval" id="dt-extlitentval" title="Literal Entity Value">Definition</a>: For an external entity, the <b>literal | ||
1154 | entity value</b> is the exact text contained in the entity.] [<a name="dt-repltext" id="dt-repltext" title="Replacement Text">Definition</a>: <span>For an | ||
1155 | internal entity, </span>the <b>replacement text</b> | ||
1156 | is the content of the entity, after replacement of character references and | ||
1157 | parameter-entity references.] [<a name="dt-extrepltext" id="dt-extrepltext" title="Replacement Text">Definition</a>: For | ||
1158 | an external entity, the <b>replacement text</b> is the content of the entity, | ||
1159 | after stripping the text declaration (leaving any surrounding white space) if there | ||
1160 | is one but without any replacement of character references or parameter-entity | ||
1161 | references.]</p><p>The literal entity value as given in an internal entity declaration (<a href="#NT-EntityValue">EntityValue</a>) <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> contain character, parameter-entity, | ||
1162 | and general-entity references. Such references <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be contained entirely | ||
1163 | within the literal entity value. The actual replacement text that is <a title="Include" href="#dt-include">included</a><span> (or <a title="" href="#inliteral">included in literal</a>)</span> as described above | ||
1164 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> contain the <em>replacement | ||
1165 | text</em> of any parameter entities referred to, and <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> contain the character | ||
1166 | referred to, in place of any character references in the literal entity value; | ||
1167 | however, general-entity references <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be left as-is, unexpanded. For example, | ||
1168 | given the following declarations:</p><div class="exampleInner"><pre>&lt;!ENTITY % pub "&amp;#xc9;ditions Gallimard" &gt; | ||
1169 | &lt;!ENTITY rights "All rights reserved" &gt; | ||
1170 | &lt;!ENTITY book "La Peste: Albert Camus, | ||
1171 | &amp;#xA9; 1947 %pub;. &amp;rights;" &gt;</pre></div><p>then the replacement text for the entity "<code>book</code>" | ||
1172 | is:</p><div class="exampleInner"><pre>La Peste: Albert Camus, | ||
1173 | © 1947 Éditions Gallimard. &amp;rights;</pre></div><p>The general-entity reference "<code>&amp;rights;</code>" would | ||
1174 | be expanded should the reference "<code>&amp;book;</code>" appear | ||
1175 | in the document's content or an attribute value.</p><p>These simple rules may have complex interactions; for a detailed discussion | ||
1176 | of a difficult example, see <a href="#sec-entexpand"><b>C Expansion of Entity and Character References</b></a>.</p></div><div class="div2"> <h3><a name="sec-predefined-ent" id="sec-predefined-ent" />4.6 Predefined Entities</h3><p>[<a name="dt-escape" id="dt-escape" title="escape">Definition</a>: Entity and character references <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> | ||
1177 | both be used to <b>escape</b> the left angle bracket, ampersand, and | ||
1178 | other delimiters. A set of general entities (<code>amp</code>, | ||
1179 | <code>lt</code>, | ||
1180 | <code>gt</code>, | ||
1181 | <code>apos</code>, | ||
1182 | <code>quot</code>) is specified for | ||
1183 | this purpose. Numeric character references <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> also be used; they are expanded | ||
1184 | immediately when recognized and <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be treated as character data, so the | ||
1185 | numeric character references "<code>&amp;#60;</code>" and "<code>&amp;#38;</code>" <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> be used to escape <code>&lt;</code> and <code>&amp;</code> when they occur | ||
1186 | in character data.]</p><p>All XML processors <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> recognize these entities whether they are declared | ||
1187 | or not. <a title="For interoperability" href="#dt-interop">For interoperability</a>, valid XML | ||
1188 | documents <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> declare these entities, like any others, before using them. If | ||
1189 | the entities <code>lt</code> or <code>amp</code> are declared, they <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be | ||
1190 | declared as internal entities whose replacement text is a character reference | ||
1191 | to the respective | ||
1192 | character (less-than sign or ampersand) being escaped; the double | ||
1193 | escaping is <em class="rfc2119" title="Keyword in RFC 2119 context">REQUIRED</em> for these entities so that references to them produce | ||
1194 | a well-formed result. If the entities <code>gt</code>, <code>apos</code>, | ||
1195 | or <code>quot</code> are declared, they <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be declared as internal entities | ||
1196 | whose replacement text is the single character being escaped (or a character | ||
1197 | reference to that character; the double escaping here is <span class="mustard"><em class="rfc2119" title="Keyword in RFC 2119 context">OPTIONAL</em></span> but harmless). | ||
1198 | For example:</p><div class="exampleInner"><pre>&lt;!ENTITY lt "&amp;#38;#60;"&gt; | ||
1199 | &lt;!ENTITY gt "&amp;#62;"&gt; | ||
1200 | &lt;!ENTITY amp "&amp;#38;#38;"&gt; | ||
1201 | &lt;!ENTITY apos "&amp;#39;"&gt; | ||
1202 | &lt;!ENTITY quot "&amp;#34;"&gt;</pre></div></div><div class="div2"> <h3><a name="Notations" id="Notations" />4.7 Notation Declarations</h3><p>[<a name="dt-notation" id="dt-notation" title="Notation">Definition</a>: <b>Notations</b> identify | ||
1203 | by name the format of <a title="External Entity" href="#dt-extent">unparsed entities</a>, | ||
1204 | the format of elements which bear a notation attribute, or the application | ||
1205 | to which a <a title="Processing instruction" href="#dt-pi">processing instruction</a> is addressed.]</p><p>[<a name="dt-notdecl" id="dt-notdecl" title="Notation Declaration">Definition</a>: <b>Notation declarations</b> | ||
1206 | provide a name for the notation, for use in entity and attribute-list declarations | ||
1207 | and in attribute specifications, and an external identifier for the notation | ||
1208 | which may allow an XML processor or its client application to locate a helper | ||
1209 | application capable of processing data in the given notation.]</p> <h5><a name="IDAYTFU" id="IDAYTFU" />Notation Declarations</h5><table class="scrap" summary="Scrap"><tbody><tr valign="baseline"><td><a name="NT-NotationDecl" id="NT-NotationDecl" />[82] </td><td><code>NotationDecl</code></td><td> ::= </td><td><code>'<!NOTATION' <a href="#NT-S">S</a> <a href="#NT-Name">Name</a> <a href="#NT-S">S</a> (<a href="#NT-ExternalID">ExternalID</a> | <a href="#NT-PublicID">PublicID</a>) <a href="#NT-S">S</a>? '>'</code></td><td><a href="#UniqueNotationName">[VC: Unique Notation Name]</a></td></tr></tbody><tbody><tr valign="baseline"><td><a name="NT-PublicID" id="NT-PublicID" />[83] </td><td><code>PublicID</code></td><td> ::= </td><td><code>'PUBLIC' <a href="#NT-S">S</a> <a href="#NT-PubidLiteral">PubidLiteral</a></code></td></tr></tbody></table><div class="constraint"><p class="prefix"><a name="UniqueNotationName" id="UniqueNotationName" /><b>Validity constraint: Unique Notation Name</b></p><p><span class="mustard">A given <a href="#NT-Name">Name</a> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> be declared in more than one notation declaration.</span></p></div><p>XML processors <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> provide applications with the name and external identifier(s) | ||
1210 | of any notation declared and referred to in an attribute value, attribute | ||
1211 | definition, or entity declaration. They <em class="rfc2119" title="Keyword in RFC 2119 context">MAY</em> additionally resolve the external | ||
1212 | identifier into the <a title="System Identifier" href="#dt-sysid">system identifier</a>, file | ||
1213 | name, or other information needed to allow the application to call a processor | ||
1214 | for data in the notation described. (It is not an error, however, for XML | ||
1215 | documents to declare and refer to notations for which notation-specific applications | ||
1216 | are not available on the system where the XML processor or application is | ||
1217 | running.)</p></div><div class="div2"> <h3><a name="sec-doc-entity" id="sec-doc-entity" />4.8 Document Entity</h3><p>[<a name="dt-docent" id="dt-docent" title="Document Entity">Definition</a>: The <b>document entity</b> | ||
1218 | serves as the root of the entity tree and a starting-point for an <a title="XML Processor" href="#dt-xml-proc">XML processor</a>.] This specification does | ||
1219 | not specify how the document entity is to be located by an XML processor; | ||
1220 | unlike other entities, the document entity has no name and might well appear | ||
1221 | on a processor input stream without any identification at all.</p></div></div><div class="div1"> <h2><a name="sec-conformance" id="sec-conformance" />5 Conformance</h2><div class="div2"> <h3><a name="proc-types" id="proc-types" />5.1 Validating and Non-Validating Processors</h3><p>Conforming <a title="XML Processor" href="#dt-xml-proc">XML processors</a> fall into | ||
1222 | two classes: validating and non-validating.</p><p>Validating and non-validating processors alike <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> report violations of | ||
1223 | this specification's well-formedness constraints in the content of the <a title="Document Entity" href="#dt-docent">document entity</a> and any other <a title="Text Entity" href="#dt-parsedent">parsed | ||
1224 | entities</a> that they read.</p><p>[<a name="dt-validating" id="dt-validating" title="Validating Processor">Definition</a>: <b>Validating | ||
1225 | processors</b> <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em>, | ||
1226 | at user option, report violations of the constraints expressed by | ||
1227 | the declarations in the <a title="Document Type Declaration" href="#dt-doctype">DTD</a>, and failures | ||
1228 | to fulfill the validity constraints given in this specification.] | ||
1229 | To accomplish this, validating XML processors <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> read and process the entire | ||
1230 | DTD and all external parsed entities referenced in the document.</p><p>Non-validating processors are <em class="rfc2119" title="Keyword in RFC 2119 context">REQUIRED</em> to check only the <a title="Document Entity" href="#dt-docent">document | ||
1231 | entity</a>, including the entire internal DTD subset, for well-formedness. [<a name="dt-use-mdecl" id="dt-use-mdecl" title="Process Declarations">Definition</a>: While they are not required | ||
1232 | to check the document for validity, they are <em class="rfc2119" title="Keyword in RFC 2119 context">REQUIRED</em> to <b>process</b> | ||
1233 | all the declarations they read in the internal DTD subset and in any parameter | ||
1234 | entity that they read, up to the first reference to a parameter entity that | ||
1235 | they do <em>not</em> read; that is to say, they <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> use the information | ||
1236 | in those declarations to <a href="#AVNormalize"><cite>normalize</cite></a> | ||
1237 | attribute values, <a href="#included"><cite>include</cite></a> the replacement | ||
1238 | text of internal entities, and supply <a href="#sec-attr-defaults"><cite>default | ||
1239 | attribute values</cite></a>.] Except when <code>standalone="yes"</code>, they | ||
1240 | <em class="rfc2119" title="Keyword in RFC 2119 context">MUST NOT</em> <a title="Process Declarations" href="#dt-use-mdecl">process</a> <a title="entity declaration" href="#dt-entdecl">entity | ||
1241 | declarations</a> or <a title="Attribute-List Declaration" href="#dt-attdecl">attribute-list declarations</a> | ||
1242 | encountered after a reference to a parameter entity that is not read, since | ||
1243 | the entity may have contained overriding declarations<span>; when <code>standalone="yes"</code>, processors <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> | ||
1244 | process these declarations</span>.</p><p>Note | ||
1245 | that when processing invalid documents with a non-validating | ||
1246 | processor the application may not be presented with consistent | ||
1247 | information. For example, several requirements for uniqueness | ||
1248 | within the document may not be met, including more than one element | ||
1249 | with the same id, duplicate declarations of elements or notations | ||
1250 | with the same name, etc. In these cases the behavior of the parser | ||
1251 | with respect to reporting such information to the application is | ||
1252 | undefined.</p><p>XML 1.1 processors <em class="rfc2119" title="Keyword in RFC 2119 context">MUST</em> be able to process both XML 1.0 | ||
1253 | and XML 1.1 documents. Programs which generate XML <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> | ||
1254 | generate XML 1.0, unless one of the specific features of XML 1.1 is required.</p></div><div class="div2"> <h3><a name="safe-behavior" id="safe-behavior" />5.2 Using XML Processors</h3><p>The behavior of a validating XML processor is highly predictable; it must | ||
1255 | read every piece of a document and report all well-formedness and validity | ||
1256 | violations. Less is required of a non-validating processor; it need not read | ||
1257 | any part of the document other than the document entity. This has two effects | ||
1258 | that may be important to users of XML processors:</p><ul><li><p>Certain well-formedness errors, specifically those that require reading | ||
1259 | external entities, <span>may fail to</span> be detected by a non-validating processor. Examples | ||
1260 | include the constraints entitled <a href="#wf-entdeclared"><cite>Entity Declared</cite></a>, <a href="#textent"><cite>Parsed Entity</cite></a>, and <a href="#norecursion"><cite>No | ||
1261 | Recursion</cite></a>, as well as some of the cases described as <a href="#forbidden"><cite>forbidden</cite></a> in <a href="#entproc"><b>4.4 XML Processor Treatment of Entities and References</b></a>.</p></li><li><p>The information passed from the processor to the application may | ||
1262 | vary, depending on whether the processor reads parameter and external entities. | ||
1263 | For example, a non-validating processor <span>may fail to</span> <a href="#AVNormalize"><cite>normalize</cite></a> | ||
1264 | attribute values, <a href="#included"><cite>include</cite></a> the replacement | ||
1265 | text of internal entities, or supply <a href="#sec-attr-defaults"><cite>default | ||
1266 | attribute values</cite></a>, where doing so depends on having read declarations | ||
1267 | in external or parameter entities.</p></li></ul><p>For maximum reliability in interoperating between different XML processors, | ||
1268 | applications which use non-validating processors <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD NOT</em> rely on any behaviors | ||
1269 | not required of such processors. Applications which require DTD facilities | ||
1270 | not related to validation (such | ||
1271 | as the declaration of default attributes and internal entities that are | ||
1272 | or may be specified in | ||
1273 | external entities) <em class="rfc2119" title="Keyword in RFC 2119 context">SHOULD</em> use validating XML processors.</p></div></div><div class="div1"> <h2><a name="sec-notation" id="sec-notation" />6 Notation</h2><p>The formal grammar of XML is given in this specification using a simple | ||
1274 | Extended Backus-Naur Form (EBNF) notation. Each rule in the grammar defines | ||
1275 | one symbol, in the form</p><div class="exampleInner"><pre>symbol ::= expression</pre></div><p>Symbols are written with an initial capital letter if they are the | ||
1276 | start symbol of a regular language, otherwise with an initial lowercase | ||
1277 | letter. Literal strings are quoted.</p><p>Within the expression on the right-hand side of a rule, the following expressions | ||
1278 | are used to match strings of one or more characters: </p><dl><dt class="label"><code>#xN</code></dt><dd><p>where <code>N</code> is a hexadecimal integer, the expression matches the character | ||
1279 | <span>whose</span><span> number | ||
1280 | (code point) in</span> ISO/IEC 10646 <span>is <code>N</code></span>. The number of leading zeros in the <code>#xN</code> | ||
1281 | form is insignificant.</p></dd><dt class="label"><code>[a-zA-Z]</code>, <code>[#xN-#xN]</code></dt><dd><p>matches any <a href="#NT-Char">Char</a> with a value in the range(s) indicated (inclusive).</p></dd><dt class="label"><code>[abc]</code>, <code>[#xN#xN#xN]</code></dt><dd><p>matches any <a href="#NT-Char">Char</a> with a value among the characters | ||
1282 | enumerated. Enumerations and ranges can be mixed in one set of brackets.</p></dd><dt class="label"><code>[^a-z]</code>, <code>[^#xN-#xN]</code></dt><dd><p>matches any <a href="#NT-Char">Char</a> with a value <em>outside</em> the range | ||
1283 | indicated.</p></dd><dt class="label"><code>[^abc]</code>, <code>[^#xN#xN#xN]</code></dt><dd><p>matches any <a href="#NT-Char">Char</a> with a value not among the characters given. Enumerations | ||
1284 | and ranges of forbidden values can be mixed in one set of brackets.</p></dd><dt class="label"><code>"string"</code></dt><dd><p>matches a literal string <a title="match" href="#dt-match">matching</a> that | ||
1285 | given inside the double quotes.</p></dd><dt class="label"><code>'string'</code></dt><dd><p>matches a literal string <a title="match" href="#dt-match">matching</a> that | ||
1286 | given inside the single quotes.</p></dd></dl><p> These symbols may be combined to match more complex patterns as follows, | ||
1287 | where <code>A</code> and <code>B</code> represent simple expressions: </p><dl><dt class="label">(<code>expression</code>)</dt><dd><p><code>expression</code> is treated as a unit and may be combined as described | ||
1288 | in this list.</p></dd><dt class="label"><code>A?</code></dt><dd><p>matches <code>A</code> or nothing; optional <code>A</code>.</p></dd><dt class="label"><code>A B</code></dt><dd><p>matches <code>A</code> followed by <code>B</code>. This | ||
1289 | operator has higher precedence than alternation; thus <code>A B | C D</code> | ||
1290 | is identical to <code>(A B) | (C D)</code>.</p></dd><dt class="label"><code>A | B</code></dt><dd><p>matches <code>A</code> or <code>B</code>.</p></dd><dt class="label"><code>A - B</code></dt><dd><p>matches any string that matches <code>A</code> but does not match <code>B</code>.</p></dd><dt class="label"><code>A+</code></dt><dd><p>matches one or more occurrences of <code>A</code>. Concatenation | ||
1291 | has higher precedence than alternation; thus <code>A+ | B+</code> is identical | ||
1292 | to <code>(A+) | (B+)</code>.</p></dd><dt class="label"><code>A*</code></dt><dd><p>matches zero or more occurrences of <code>A</code>. Concatenation | ||
1293 | has higher precedence than alternation; thus <code>A* | B*</code> is identical | ||
1294 | to <code>(A*) | (B*)</code>.</p></dd></dl><p> Other notations used in the productions are: </p><dl><dt class="label"><code>/* ... */</code></dt><dd><p>comment.</p></dd><dt class="label"><code>[ wfc: ... ]</code></dt><dd><p>well-formedness constraint; this identifies by name a constraint on <a title="Well-Formed" href="#dt-wellformed">well-formed</a> documents associated with a production.</p></dd><dt class="label"><code>[ vc: ... ]</code></dt><dd><p>validity constraint; this identifies by name a constraint on <a title="Validity" href="#dt-valid">valid</a> | ||
1295 | documents associated with a production.</p></dd></dl><p></p></div></div><div class="back"><div class="div1"> <h2><a name="sec-bibliography" id="sec-bibliography" />A References</h2><div class="div2"> <h3><a name="sec-existing-stds" id="sec-existing-stds" />A.1 Normative References</h3><dl><dt class="label"><a name="IANA" id="IANA" />IANA-CHARSETS</dt><dd>(Internet | ||
1296 | Assigned Numbers Authority) <a href="http://www.iana.org/assignments/character-sets"><cite>Official Names for Character Sets</cite></a>, | ||
1297 | ed. Keld Simonsen et al. (See http://www.iana.org/assignments/character-sets.)</dd><dt class="label"><a name="rfc2119" id="rfc2119" />IETF RFC 2119</dt><dd>IETF | ||
1298 | (Internet Engineering Task Force). <a href="http://www.ietf.org/rfc/rfc2119.txt"><cite>RFC 2119: Key words for use in RFCs to Indicate Requirement Levels</cite></a>. | ||
1299 | Scott Bradner, 1997. (See http://www.ietf.org/rfc/rfc2119.txt.)</dd><dt class="label"><a name="rfc2396" id="rfc2396" />IETF RFC 2396</dt><dd>IETF | ||
1300 | (Internet Engineering Task Force). <a href="http://www.ietf.org/rfc/rfc2396.txt"><cite>RFC 2396: Uniform Resource Identifiers | ||
1301 | (URI): Generic Syntax</cite></a>. T. Berners-Lee, R. Fielding, L. Masinter. | ||
1302 | 1998. (See http://www.ietf.org/rfc/rfc2396.txt.)</dd><dt class="label"><a name="rfc2732" id="rfc2732" />IETF RFC 2732</dt><dd>IETF | ||
1303 | (Internet Engineering Task Force). <a href="http://www.ietf.org/rfc/rfc2732.txt"><cite>RFC 2732: Format for Literal | ||
1304 | IPv6 Addresses in URL's</cite></a>. R. Hinden, B. Carpenter, L. Masinter. | ||
1305 | 1999. (See http://www.ietf.org/rfc/rfc2732.txt.)</dd><dt class="label"><a name="RFC1766" id="RFC1766" />IETF RFC 3066</dt><dd>IETF | ||
1306 | (Internet Engineering Task Force). <a href="http://www.ietf.org/rfc/rfc3066.txt"><cite>RFC 3066: Tags for the Identification | ||
1307 | of Languages</cite></a>, ed. H. Alvestrand. 2001. (See http://www.ietf.org/rfc/rfc3066.txt.)</dd><dt class="label"><a name="ISO10646" id="ISO10646" />ISO/IEC 10646</dt><dd><span>ISO (International | ||
1308 | Organization for Standardization). <cite>ISO/IEC 10646-1:2000. Information | ||
1309 | technology — Universal Multiple-Octet Coded Character Set (UCS) — | ||
1310 | Part 1: Architecture and Basic Multilingual Plane</cite> and <cite>ISO/IEC 10646-2:2001. | ||
1311 | Information technology — Universal Multiple-Octet Coded Character Set (UCS) — Part 2: | ||
1312 | Supplementary Planes</cite>, as, from time to time, amended, replaced by a new edition or | ||
1313 | expanded by the addition of new parts. [Geneva]: International Organization for Standardization. | ||
1314 | (See <a href="http://www.iso.ch">http://www.iso.ch</a> for the latest version.)</span></dd><dt class="label"><a name="Unicode" id="Unicode" />Unicode</dt><dd>The Unicode Consortium. <em>The Unicode | ||
1315 | Standard, Version 4.0.</em> Reading, Mass.: Addison-Wesley, | ||
1316 | 2003, | ||
1317 | as updated from time to time by the publication of new versions. (See | ||
1318 | <a href="http://www.unicode.org/unicode/standard/versions"> | ||
1319 | http://www.unicode.org/unicode/standard/versions</a> for the latest version | ||
1320 | and additional information on versions of the standard and of the Unicode | ||
1321 | Character Database).</dd><dt class="label"><a name="XML1.0" />XML-1.0</dt><dd>W3C. <a href="http://www.w3.org/TR/REC-xml"><cite>Extensible Markup Language (XML) 1.0 (Third | ||
1322 | Edition)</cite></a>. Tim Bray, Jean Paoli, C.M. Sperberg-McQueen, Eve Maler, François Yergeau | ||
1323 | (editors) (See http://www.w3.org/TR/REC-xml.)</dd></dl></div><div class="div2"> <h3><a name="null" id="null" />A.2 Other References</h3><dl><dt class="label"><a name="Aho" id="Aho" />Aho/Ullman</dt><dd>Aho, Alfred V., Ravi Sethi, and Jeffrey D. | ||
1324 | Ullman. <cite>Compilers: Principles, Techniques, and Tools</cite>. | ||
1325 | Reading: Addison-Wesley, 1986, rpt. corr. 1988.</dd><dt class="label"><a name="ABK" id="ABK" />Brüggemann-Klein</dt><dd>Brüggemann-Klein, | ||
1326 | Anne. <a href="ftp://ftp.informatik.uni-freiburg.de/documents/papers/brueggem/habil.ps"><cite>Formal Models in Document Processing</cite></a>. Habilitationsschrift. Faculty | ||
1327 | of Mathematics at the University of Freiburg, 1993. (See ftp://ftp.informatik.uni-freiburg.de/documents/papers/brueggem/habil.ps.)</dd><dt class="label"><a name="ABKDW" id="ABKDW" />Brüggemann-Klein and Wood</dt><dd>Brüggemann-Klein, | ||
1328 | Anne, and Derick Wood. <cite>Deterministic Regular Languages</cite>. | ||
1329 | Universität Freiburg, Institut für Informatik, Bericht 38, Oktober 1991. Extended | ||
1330 | abstract in A. Finkel, M. Jantzen, Hrsg., STACS 1992, S. 173-184. Springer-Verlag, | ||
1331 | Berlin 1992. Lecture Notes in Computer Science 577. Full version titled <cite>One-Unambiguous | ||
1332 | Regular Languages</cite> in Information and Computation 140 (2): 229-253, | ||
1333 | February 1998.</dd><dt class="label"><a name="Charmod" />Charmod</dt><dd>W3C Working Draft. | ||
1334 | |||
1335 | <a href="http://www.w3.org/TR/2003/WD-charmod-20030822/"><cite>Character Model for the World Wide Web 1.0</cite></a>. | ||
1336 | |||
1337 | Martin J. Dürst, François Yergeau, Richard Ishida, Misha Wolf, Tex Texin. (See http://www.w3.org/TR/2003/WD-charmod-20030822/.)</dd><dt class="label"><a name="Clark" id="Clark" />Clark</dt><dd>James Clark. | ||
1338 | <a href="http://www.w3.org/TR/NOTE-sgml-xml-971215"><cite>Comparison of SGML and XML</cite></a>. (See http://www.w3.org/TR/NOTE-sgml-xml-971215.)</dd><dt class="label"><a name="IANA-LANGCODES" id="IANA-LANGCODES" />IANA-LANGCODES</dt><dd>(Internet | ||
1339 | Assigned Numbers Authority) <a href="http://www.iana.org/assignments/language-tags"><cite>Registry of Language Tags</cite></a>, | ||
1340 | ed. Keld Simonsen et al. (See http://www.iana.org/assignments/language-tags.)</dd><dt class="label"><a name="RFC2141" id="RFC2141" />IETF RFC 2141</dt><dd>IETF | ||
1341 | (Internet Engineering Task Force). <a href="http://www.ietf.org/rfc/rfc2141.txt"><cite>RFC 2141: URN Syntax</cite></a>, ed. | ||
1342 | R. Moats. 1997. (See http://www.ietf.org/rfc/rfc2141.txt.)</dd><dt class="label"><a name="rfc2376" id="rfc2376" />IETF RFC 3023</dt><dd>IETF | ||
1343 | (Internet Engineering Task Force). <a href="http://www.ietf.org/rfc/rfc3023.txt"><cite>RFC 3023: XML Media Types</cite></a>. | ||
1344 | eds. M. Murata, S. St.Laurent, D. Kohn. 2001. (See http://www.ietf.org/rfc/rfc3023.txt.)</dd><dt class="label"><a name="rfc2781" id="rfc2781" />IETF RFC 2781</dt><dd>IETF | ||
1345 | (Internet Engineering Task Force). <a href="http://www.ietf.org/rfc/rfc2781.txt"><cite>RFC 2781: UTF-16, an encoding | ||
1346 | of ISO 10646</cite></a>, ed. P. Hoffman, F. Yergeau. 2000. (See http://www.ietf.org/rfc/rfc2781.txt.)</dd><dt class="label"><a name="ISO639" id="ISO639" />ISO 639</dt><dd>(International Organization for Standardization). | ||
1347 | <cite>ISO 639:1988 (E). | ||
1348 | Code for the representation of names of languages.</cite> [Geneva]: International | ||
1349 | Organization for Standardization, 1988.</dd><dt class="label"><a name="ISO3166" id="ISO3166" />ISO 3166</dt><dd>(International Organization for Standardization). | ||
1350 | <cite>ISO 3166-1:1997 | ||
1351 | (E). Codes for the representation of names of countries and their subdivisions — | ||
1352 | Part 1: Country codes</cite> [Geneva]: International Organization for | ||
1353 | Standardization, 1997.</dd><dt class="label"><a name="ISO8879" id="ISO8879" />ISO 8879</dt><dd>ISO (International Organization for Standardization). <cite>ISO | ||
1354 | 8879:1986(E). Information processing — Text and Office Systems — | ||
1355 | Standard Generalized Markup Language (SGML).</cite> First edition — | ||
1356 | 1986-10-15. [Geneva]: International Organization for Standardization, 1986. </dd><dt class="label"><a name="ISO10744" id="ISO10744" />ISO/IEC 10744</dt><dd>ISO (International Organization for | ||
1357 | Standardization). <cite>ISO/IEC 10744-1992 (E). Information technology — | ||
1358 | Hypermedia/Time-based Structuring Language (HyTime). </cite> [Geneva]: | ||
1359 | International Organization for Standardization, 1992. <em>Extended Facilities | ||
1360 | Annexe.</em> [Geneva]: International Organization for Standardization, 1996. </dd><dt class="label"><a name="websgml" id="websgml" />WEBSGML</dt><dd>ISO | ||
1361 | (International Organization for Standardization). <a href="http://www.sgmlsource.com/8879/n0029.htm"><cite>ISO 8879:1986 | ||
1362 | TC2. Information technology — Document Description and Processing Languages</cite></a>. | ||
1363 | [Geneva]: International Organization for Standardization, 1998. (See http://www.sgmlsource.com/8879/n0029.htm.)</dd><dt class="label"><a name="xml-names" id="xml-names" />XML Names</dt><dd>Tim Bray, | ||
1364 | Dave Hollander, and Andrew Layman, editors. <a href="http://www.w3.org/TR/REC-xml-names/"><cite>Namespaces in XML</cite></a>. | ||
1365 | Textuality, Hewlett-Packard, and Microsoft. World Wide Web Consortium, 1999. (See http://www.w3.org/TR/REC-xml-names/.)</dd></dl></div></div><div class="div1"> <h2><a name="sec-CharNorm" id="sec-CharNorm" />B Definitions for Character Normalization</h2><p>This appendix contains the necessary definitions for character normalization. | ||
1366 | For additional background information and examples, see <a href="#Charmod">[Charmod]</a>.</p><p> | ||
1367 | [<a name="dt-Uni-encform" id="dt-Uni-encform" title="Unicode encoding form">Definition</a>: Text is said to be | ||
1368 | in a <b>Unicode encoding form</b> if it is encoded in | ||
1369 | UTF-8, UTF-16 or UTF-32.]</p><p> | ||
1370 | [<a name="dt-legacyenc" id="dt-legacyenc" title="legacy encoding">Definition</a>: <b>Legacy encoding</b> | ||
1371 | is taken to mean any character encoding not based on Unicode.]</p><p> | ||
1372 | [<a name="dt-normtransc" id="dt-normtransc" title="normalizing transcoder">Definition</a>: A | ||
1373 | <b>normalizing transcoder</b> is a transcoder that converts from a | ||
1374 | <a title="legacy encoding" href="#dt-legacyenc">legacy encoding</a> to a | ||
1375 | <a title="Unicode encoding form" href="#dt-Uni-encform">Unicode encoding form</a> and | ||
1376 | ensures that the result is in Unicode Normalization Form C | ||
1377 | (see UAX #15 <a href="#Unicode">[Unicode]</a>).]</p><p>[<a name="dt-charesc" id="dt-charesc" title="character escape">Definition</a>: A <b>character escape</b> | ||
1378 | is a syntactic device defined in a markup or programming language that allows | ||
1379 | one or more of:]</p><ol type="1"><li><p>expressing syntax-significant characters while disregarding | ||
1380 | their significance in the syntax of the language, or</p></li><li><p>expressing characters not representable in the character encoding | ||
1381 | chosen for an instance of the language, or</p></li><li><p>expressing characters in general, without use of the corresponding | ||
1382 | character codes.</p></li></ol><p> | ||
1383 | [<a name="dt-certified" id="dt-certified" title="certified">Definition</a>: <b>Certified</b> text | ||
1384 | is text which satisfies at least one of the following conditions:]</p><ol type="1"><li><p>it has been confirmed through inspection that the text | ||
1385 | is in normalized form</p></li><li><p>the source text-processing component is identified | ||
1386 | and is known to produce only normalized text.</p></li></ol><p> | ||
1387 | [<a name="dt-uninorm" id="dt-uninorm" title="Unicode-normalized">Definition</a>: Text is, for the purposes of | ||
1388 | this specification, <b>Unicode-normalized</b> if it is in a | ||
1389 | <a title="Unicode encoding form" href="#dt-Uni-encform">Unicode encoding form</a> and is in | ||
1390 | Unicode Normalization Form C, according to a version of Unicode Standard Annex #15: | ||
1391 | Unicode Normalization Forms <a href="#Unicode">[Unicode]</a> at least as recent as the | ||
1392 | oldest version of the Unicode Standard that contains all the characters | ||
1393 | actually present in the text, but no earlier | ||
1394 | than version 3.2.]</p><p> | ||
1395 | [<a name="dt-inclnorm" id="dt-inclnorm" title="include-normalized">Definition</a>: Text is | ||
1396 | <b>include-normalized</b> if:]</p><ol type="1"><li><p>the text is <a title="Unicode-normalized" href="#dt-uninorm">Unicode-normalized</a> | ||
1397 | and does not contain any <a title="character escape" href="#dt-charesc">character escapes</a> | ||
1398 | or <a title="Include" href="#dt-include">includes</a> whose expansion would | ||
1399 | cause the text to become no longer <a title="Unicode-normalized" href="#dt-uninorm">Unicode-normalized</a>; | ||
1400 | or</p></li><li><p>the text is in a <a title="legacy encoding" href="#dt-legacyenc">legacy encoding</a> and, if it were transcoded | ||
1401 | to a <a title="Unicode encoding form" href="#dt-Uni-encform">Unicode encoding form</a> by a | ||
1402 | <a title="normalizing transcoder" href="#dt-normtransc">normalizing transcoder</a>, the resulting | ||
1403 | text would satisfy clause 1 above.</p></li></ol><p> | ||
1404 | [<a name="dt-compchar" id="dt-compchar" title="composing character">Definition</a>: A <b>composing character</b> | ||
1405 | is a character that is one or both of the following:]</p><ol type="1"><li><p>the second character in the canonical decomposition mapping of | ||
1406 | some primary composite (as defined in D3 of UAX #15 <a href="#Unicode">[Unicode]</a>), or</p></li><li><p>of non-zero canonical combining class (as defined in Unicode | ||
1407 | <a href="#Unicode">[Unicode]</a>).</p></li></ol><p> | ||
1408 | [<a name="dt-fullnorm" id="dt-fullnorm" title="fully normalized">Definition</a>: Text is | ||
1409 | <b>fully-normalized</b> if:]</p><ol type="1"><li><p>the text is in a <a title="Unicode encoding form" href="#dt-Uni-encform">Unicode encoding | ||
1410 | form</a>, is <a title="include-normalized" href="#dt-inclnorm">include-normalized</a> and | ||
1411 | none of the <a title="" href="#dt-relconst"><span>relevant</span> | ||
1412 | constructs</a> comprising the text begin with a | ||
1413 | <a title="composing character" href="#dt-compchar">composing character</a> or a | ||
1414 | character escape representing a | ||
1415 | <a title="composing character" href="#dt-compchar">composing character</a>; or</p></li><li><p>the text is in a <a title="legacy encoding" href="#dt-legacyenc">legacy encoding</a> and, | ||
1416 | if it were transcoded to a <a title="Unicode encoding form" href="#dt-Uni-encform">Unicode encoding form</a> | ||
1417 | by a <a title="normalizing transcoder" href="#dt-normtransc">normalizing transcoder</a>, the resulting text | ||
1418 | would satisfy clause 1 above.</p></li></ol></div><div class="div1"> <h2><a name="sec-entexpand" id="sec-entexpand" />C Expansion of Entity and Character References (Non-Normative)</h2><p>This appendix contains some examples illustrating the sequence of entity- | ||
1419 | and character-reference recognition and expansion, as specified in <a href="#entproc"><b>4.4 XML Processor Treatment of Entities and References</b></a>.</p><p>If the DTD contains the declaration</p><div class="exampleInner"><pre><!ENTITY example "<p>An ampersand (&#38;#38;) may be escaped | ||
1420 | numerically (&#38;#38;#38;) or with a general entity | ||
1421 | (&amp;amp;).</p>" ></pre></div><p>then the XML processor will recognize the character references when it | ||
1422 | parses the entity declaration, and resolve them before storing the following | ||
1423 | string as the value of the entity "<code>example</code>":</p><div class="exampleInner"><pre><p>An ampersand (&#38;) may be escaped | ||
1424 | numerically (&#38;#38;) or with a general entity | ||
1425 | (&amp;amp;).</p></pre></div><p>A reference in the document to "<code>&example;</code>" | ||
1426 | will cause the text to be reparsed, at which time the start- and end-tags | ||
1427 | of the <code>p</code> element will be recognized and the three references will | ||
1428 | be recognized and expanded, resulting in a <code>p</code> element with the following | ||
1429 | content (all data, no delimiters or markup):</p><div class="exampleInner"><pre>An ampersand (&) may be escaped | ||
1430 | numerically (&#38;) or with a general entity | ||
1431 | (&amp;).</pre></div><p>A more complex example will illustrate the rules and their effects fully. | ||
1432 | In the following example, the line numbers are solely for reference.</p><div class="exampleInner"><pre>1 <?xml version='1.0'?> | ||
1433 | 2 <!DOCTYPE test [ | ||
1434 | 3 <!ELEMENT test (#PCDATA) > | ||
1435 | 4 <!ENTITY % xx '&#37;zz;'> | ||
1436 | 5 <!ENTITY % zz '&#60;!ENTITY tricky "error-prone" >' > | ||
1437 | 6 %xx; | ||
1438 | 7 ]> | ||
1439 | 8 <test>This sample shows a &tricky; method.</test></pre></div><p>This produces the following:</p><ul><li><p>in line 4, the reference to character 37 is expanded immediately, | ||
1440 | and the parameter entity "<code>xx</code>" is stored in the symbol | ||
1441 | table with the value "<code>%zz;</code>". Since the replacement | ||
1442 | text is not rescanned, the reference to parameter entity "<code>zz</code>" | ||
1443 | is not recognized. (And it would be an error if it were, since "<code>zz</code>" | ||
1444 | is not yet declared.)</p></li><li><p>in line 5, the character reference "<code>&#60;</code>" | ||
1445 | is expanded immediately and the parameter entity "<code>zz</code>" | ||
1446 | is stored with the replacement text "<code><!ENTITY tricky "error-prone" | ||
1447 | ></code>", which is a well-formed entity declaration.</p></li><li><p>in line 6, the reference to "<code>xx</code>" is recognized, | ||
1448 | and the replacement text of "<code>xx</code>" (namely "<code>%zz;</code>") | ||
1449 | is parsed. The reference to "<code>zz</code>" is recognized in | ||
1450 | its turn, and its replacement text ("<code><!ENTITY tricky "error-prone" | ||
1451 | ></code>") is parsed. The general entity "<code>tricky</code>" | ||
1452 | has now been declared, with the replacement text "<code>error-prone</code>".</p></li><li><p>in line 8, the reference to the general entity "<code>tricky</code>" | ||
1453 | is recognized, and it is expanded, so the full content of the <code>test</code> | ||
1454 | element is the self-describing (and ungrammatical) string <em>This sample | ||
1455 | shows a error-prone method.</em></p></li></ul></div><div class="div1"> <h2><a name="determinism" id="determinism" />D Deterministic Content Models (Non-Normative)</h2><p>As | ||
1456 | noted in <a href="#sec-element-content"><b>3.2.1 Element Content</b></a>, it is required that content | ||
1457 | models in element type declarations be deterministic. This requirement is <a title="For Compatibility" href="#dt-compat">for compatibility</a> with SGML (which calls deterministic | ||
1458 | content models "unambiguous"); XML processors built | ||
1459 | using SGML systems may flag non-deterministic content models as errors.</p><p>For example, the content model <code>((b, c) | (b, d))</code> is non-deterministic, | ||
1460 | because given an initial <code>b</code> the XML processor | ||
1461 | cannot know which <code>b</code> in the model is being matched without looking | ||
1462 | ahead to see which element follows the <code>b</code>. In this case, the two references | ||
1463 | to <code>b</code> can be collapsed into a single reference, making the model read <code>(b, | ||
1464 | (c | d))</code>. An initial <code>b</code> now clearly matches only a single name | ||
1465 | in the content model. The processor doesn't need to look ahead to see what follows; either <code>c</code> or <code>d</code> | ||
1466 | would be accepted.</p><p>More formally: a finite state automaton may be constructed from the content | ||
1467 | model using the standard algorithms, e.g. algorithm 3.5 in section 3.9 of | ||
1468 | Aho, Sethi, and Ullman <a href="#Aho">[Aho/Ullman]</a>. In many such algorithms, a follow | ||
1469 | set is constructed for each position in the regular expression (i.e., each | ||
1470 | leaf node in the syntax tree for the regular expression); if any position | ||
1471 | has a follow set in which more than one following position is labeled with | ||
1472 | the same element type name, then the content model is in error and may be | ||
1473 | reported as an error.</p><p>Algorithms exist which allow many but not all non-deterministic content | ||
1474 | models to be reduced automatically to equivalent deterministic models; see | ||
1475 | Brüggemann-Klein 1991 <a href="#ABK">[Brüggemann-Klein]</a>.</p></div><div class="div1"> <h2><a name="sec-guessing" id="sec-guessing" />E Autodetection of Character Encodings (Non-Normative)</h2><p>The XML encoding declaration functions as an internal label on each entity, | ||
1476 | indicating which character encoding is in use. Before an XML processor can | ||
1477 | read the internal label, however, it apparently has to know what character | ||
1478 | encoding is in use — which is what the internal label is trying to indicate. | ||
1479 | In the general case, this is a hopeless situation. It is not entirely hopeless | ||
1480 | in XML, however, because XML limits the general case in two ways: each implementation | ||
1481 | is assumed to support only a finite set of character encodings, and the XML | ||
1482 | encoding declaration is restricted in position and content in order to make | ||
1483 | it feasible to autodetect the character encoding in use in each entity in | ||
1484 | normal cases. Also, in many cases other sources of information are available | ||
1485 | in addition to the XML data stream itself. Two cases may be distinguished, | ||
1486 | depending on whether the XML entity is presented to the processor without, | ||
1487 | or with, any accompanying (external) information. We consider the first case | ||
1488 | first.</p><div class="div2"> <h3><a name="sec-guessing-no-ext-info" id="sec-guessing-no-ext-info" />E.1 Detection Without External Encoding Information</h3><p>Because each XML entity not accompanied by external | ||
1489 | encoding information and not in UTF-8 or UTF-16 encoding must | ||
1490 | begin with an XML encoding declaration, in which the first characters must | ||
1491 | be '<code><?xml</code>', any conforming processor can detect, after two | ||
1492 | to four octets of input, which of the following cases apply. In reading this | ||
1493 | list, it may help to know that in UCS-4, '<' is "<code>#x0000003C</code>" | ||
1494 | and '?' is "<code>#x0000003F</code>", and the Byte Order Mark | ||
1495 | required of UTF-16 data streams is "<code>#xFEFF</code>". The notation | ||
1496 | <var>##</var> is used to denote any byte value except that two consecutive | ||
1497 | <var>##</var>s cannot be both 00.</p><p>With a Byte Order Mark:</p><table border="1" frame="border" summary="Encoding detection summary"><tbody><tr><td rowspan="1" colspan="1"><code>00 00 FE | ||
1498 | FF</code></td><td rowspan="1" colspan="1">UCS-4, big-endian machine (1234 order)</td></tr><tr><td rowspan="1" colspan="1"><code>FF | ||
1499 | FE 00 00</code></td><td rowspan="1" colspan="1">UCS-4, little-endian machine (4321 order)</td></tr><tr><td rowspan="1" colspan="1"><code>00 00 FF FE</code></td><td rowspan="1" colspan="1">UCS-4, unusual octet order (2143)</td></tr><tr><td rowspan="1" colspan="1"><code>FE FF 00 00</code></td><td rowspan="1" colspan="1">UCS-4, unusual octet order (3412)</td></tr><tr><td rowspan="1" colspan="1"><code>FE FF ## ##</code></td><td rowspan="1" colspan="1">UTF-16, big-endian</td></tr><tr><td rowspan="1" colspan="1"><code>FF FE ## ##</code></td><td rowspan="1" colspan="1">UTF-16, little-endian</td></tr><tr><td rowspan="1" colspan="1"><code>EF BB BF</code></td><td rowspan="1" colspan="1">UTF-8</td></tr></tbody></table><p>Without a Byte Order Mark:</p><table border="1" frame="border" summary="Encoding detection summary"><tbody><tr><td rowspan="1" colspan="1"><code>00 00 00 3C</code></td><td rowspan="4" colspan="1">UCS-4 or other encoding with a 32-bit code unit and ASCII | ||
1500 | characters encoded as ASCII values, in respectively big-endian (1234), little-endian | ||
1501 | (4321) and two unusual byte orders (2143 and 3412). The encoding declaration | ||
1502 | must be read to determine which of UCS-4 or other supported 32-bit encodings | ||
1503 | applies.</td></tr><tr><td rowspan="1" colspan="1"><code>3C 00 00 00</code></td></tr><tr><td rowspan="1" colspan="1"><code>00 00 3C 00</code></td></tr><tr><td rowspan="1" colspan="1"><code>00 3C 00 00</code></td></tr><tr><td rowspan="1" colspan="1"><code>00 3C 00 3F</code></td><td rowspan="1" colspan="1">UTF-16BE or big-endian ISO-10646-UCS-2 | ||
1504 | or other encoding with a 16-bit code unit in big-endian order and ASCII characters | ||
1505 | encoded as ASCII values (the encoding declaration must be read to determine | ||
1506 | which)</td></tr><tr><td rowspan="1" colspan="1"><code>3C 00 3F 00</code></td><td rowspan="1" colspan="1">UTF-16LE or little-endian | ||
1507 | ISO-10646-UCS-2 or other encoding with a 16-bit code unit in little-endian | ||
1508 | order and ASCII characters encoded as ASCII values (the encoding declaration | ||
1509 | must be read to determine which)</td></tr><tr><td rowspan="1" colspan="1"><code>3C 3F 78 6D</code></td><td rowspan="1" colspan="1">UTF-8, ISO 646, ASCII, some part of ISO 8859, Shift-JIS, EUC, or any other | ||
1510 | 7-bit, 8-bit, or mixed-width encoding which ensures that the characters of | ||
1511 | ASCII have their normal positions, width, and values; the actual encoding | ||
1512 | declaration must be read to detect which of these applies, but since all of | ||
1513 | these encodings use the same bit patterns for the relevant ASCII characters, | ||
1514 | the encoding declaration itself may be read reliably</td></tr><tr><td rowspan="1" colspan="1"><code>4C | ||
1515 | 6F A7 94</code></td><td rowspan="1" colspan="1">EBCDIC (in some flavor; the full encoding declaration | ||
1516 | must be read to tell which code page is in use)</td></tr><tr><td rowspan="1" colspan="1">Other</td><td rowspan="1" colspan="1">UTF-8 without an encoding declaration, or else the data stream is mislabeled | ||
1517 | (lacking a required encoding declaration), corrupt, fragmentary, or enclosed | ||
1518 | in a wrapper of some kind</td></tr></tbody></table><div class="note"><p class="prefix"><b>Note:</b></p><p>In cases above which do not require reading the encoding declaration to | ||
1519 | determine the encoding, section 4.3.3 still requires that the encoding declaration, | ||
1520 | if present, be read and that the encoding name be checked to match the actual | ||
1521 | encoding of the entity. Also, it is possible that new character encodings | ||
1522 | will be invented that will make it necessary to use the encoding declaration | ||
1523 | to determine the encoding, in cases where this is not required at present.</p></div><p>This level of autodetection is enough to read the XML encoding declaration | ||
1524 | and parse the character-encoding identifier, which is still necessary to distinguish | ||
1525 | the individual members of each family of encodings (e.g. to tell UTF-8 from | ||
1526 | 8859, and the parts of 8859 from each other, or to distinguish the specific | ||
1527 | EBCDIC code page in use, and so on).</p><p>Because the contents of the encoding declaration are restricted to characters | ||
1528 | from the ASCII repertoire (however encoded), | ||
1529 | a processor can reliably read the entire encoding declaration as soon as it | ||
1530 | has detected which family of encodings is in use. Since in practice, all widely | ||
1531 | used character encodings fall into one of the categories above, the XML encoding | ||
1532 | declaration allows reasonably reliable in-band labeling of character encodings, | ||
1533 | even when external sources of information at the operating-system or transport-protocol | ||
1534 | level are unreliable. Character encodings such as UTF-7 | ||
1535 | that make overloaded usage of ASCII-valued bytes may fail to be reliably detected.</p><p>Once the processor has detected the character encoding in use, it can act | ||
1536 | appropriately, whether by invoking a separate input routine for each case, | ||
1537 | or by calling the proper conversion function on each character of input.</p><p>Like any self-labeling system, the XML encoding declaration will not work | ||
1538 | if any software changes the entity's character set or encoding without updating | ||
1539 | the encoding declaration. Implementors of character-encoding routines should | ||
1540 | be careful to ensure the accuracy of the internal and external information | ||
1541 | used to label the entity.</p></div><div class="div2"> <h3><a name="sec-guessing-with-ext-info" id="sec-guessing-with-ext-info" />E.2 Priorities in the Presence of External Encoding Information</h3><p>The second possible case occurs when the XML entity is accompanied by encoding | ||
1542 | information, as in some file systems and some network protocols. When multiple | ||
1543 | sources of information are available, their relative priority and the preferred | ||
1544 | method of handling conflict should be specified as part of the higher-level | ||
1545 | protocol used to deliver XML. In particular, please refer | ||
1546 | to <a href="#rfc2376">[IETF RFC 3023]</a> or its successor, which defines the <code>text/xml</code> | ||
1547 | and <code>application/xml</code> MIME types and provides some useful guidance. | ||
1548 | In the interests of interoperability, however, the following rule is recommended.</p><ul><li><p>If an XML entity is in a file, the Byte-Order Mark and encoding declaration are used | ||
1549 | (if present) to determine the character encoding.</p></li></ul></div></div><div class="div1"> <h2><a name="sec-xml-wg" id="sec-xml-wg" />F W3C XML Working Group (Non-Normative)</h2><p>This specification was prepared and approved for publication by the W3C | ||
1550 | XML Working Group (WG). WG approval of this specification does not necessarily | ||
1551 | imply that all WG participants voted for its approval. The current and former members | ||
1552 | in the XML WG are:</p><ul><li>Jon Bosak, Sun (<i>Chair</i>) </li><li>James Clark (<i>Technical Lead</i>) </li><li>Tim Bray, Textuality and Netscape (<i>XML Co-editor</i>) </li><li>Jean Paoli, Microsoft (<i>XML | ||
1553 | Co-editor</i>) </li><li>C. M. Sperberg-McQueen, U. of Ill. (<i>XML Co-editor</i>) </li><li>Dan Connolly, W3C (<i>W3C Liaison</i>) </li><li>Paula Angerstein, Texcel</li><li>Steve DeRose, INSO</li><li>Dave Hollander, HP</li><li>Eliot Kimber, ISOGEN</li><li>Eve Maler, ArborText</li><li>Tom Magliery, NCSA</li><li>Murray Maloney, SoftQuad, Grif | ||
1554 | SA, Muzmo and Veo Systems</li><li>MURATA Makoto (FAMILY Given), Fuji | ||
1555 | Xerox Information Systems</li><li>Joel Nava, Adobe</li><li>Conleth O'Connell, Vignette</li><li>Peter Sharpe, SoftQuad</li><li>John Tigue, DataChannel</li></ul></div><div class="div1"> <h2><a name="sec-core-wg" id="sec-core-wg" />G W3C XML Core <span>Working</span> Group (Non-Normative)</h2><p>The present edition of this specification was prepared by the W3C XML Core | ||
1556 | Working Group (WG). The participants in the WG at the time of publication of this | ||
1557 | edition were:</p><ul><li>Leonid Arbouzov, Sun Microsystems</li><li>Mary Brady</li><li>John Cowan (<i>XML 1.1 First Edition Editor</i>) </li><li>John Evdemon, Microsoft</li><li>Andrew Fang, Arbortext</li><li>Paul Grosso, Arbortext (<i>Co-Chair</i>) </li><li>Arnaud Le Hors, IBM</li><li>Dmitry Lenkov, Oracle</li><li>Anjana Manian, Oracle</li><li>Glenn Marcy, IBM</li><li>Jonathan Marsh, Microsoft</li><li>Sandra Martinez, NIST</li><li>Liam Quin, W3C (<i>Staff Contact</i>) </li><li>Lew Shannon</li><li>Richard Tobin, University of Edinburgh</li><li>Daniel Veillard</li><li>Norman Walsh, Sun Microsystems (<i>Co-Chair</i>) </li><li>François Yergeau</li></ul></div><div class="div1"> <h2><a name="prod-notes" id="prod-notes" />H Production Notes (Non-Normative)</h2><p>This edition was encoded in a | ||
1558 | slightly modified version of the | ||
1559 | <a href="http://www.w3.org/2002/xmlspec/dtd/2.5/xmlspec.dtd">XMLspec DTD, 2.5</a>. | ||
1560 | The XHTML versions were produced with a combination of the | ||
1561 | <a href="http://www.w3.org/2002/xmlspec/xhtml/1.9/xmlspec.xsl">xmlspec.xsl</a>, | ||
1562 | <a href="http://www.w3.org/2002/xmlspec/xhtml/1.9/diffspec.xsl">diffspec.xsl</a>, | ||
1563 | and <a href="REC-xml-3e.xsl">REC-xml-3e.xsl</a> | ||
1564 | XSLT stylesheets.</p></div><div class="div1"> <h2><a name="sec-suggested-names" id="sec-suggested-names" />I Suggestions for XML Names (Non-Normative)</h2><p>The following suggestions define what is believed to be best | ||
1565 | practice in the construction of XML names used as element names, | ||
1566 | attribute names, processing instruction targets, entity names, | ||
1567 | notation names, and the values of attributes of type ID, and are | ||
1568 | intended as guidance for document authors and schema designers. | ||
1569 | All references to Unicode are understood with respect to | ||
1570 | a particular version of the Unicode Standard greater than or equal | ||
1571 | to 3.0; which version should be used is left to the discretion of | ||
1572 | the document author or schema designer.</p><p>The first two suggestions are directly derived from the rules | ||
1573 | given for identifiers in the Unicode Standard, version 3.0, and | ||
1574 | exclude all control characters, enclosing nonspacing marks, | ||
1575 | non-decimal numbers, private-use characters, punctuation characters | ||
1576 | (with the noted exceptions), symbol characters, unassigned | ||
1577 | codepoints, and white space characters. The other suggestions | ||
1578 | are mostly derived from <a href="#XML1.0">[XML-1.0]</a> Appendix B.</p><ol type="1"><li><p>The first character of any name should have a Unicode General | ||
1579 | Category of Ll, Lu, Lo, Lm, Lt, or Nl, or else be '_' #x5F.</p></li><li><p>Characters other than the first should have a Unicode General | ||
1580 | Category of Ll, Lu, Lo, Lm, Lt, Mc, Mn, Nl, Nd, Pc, or Cf, or else | ||
1581 | be one of the following: '-' #x2D, '.' #x2E, ':' #x3A or | ||
1582 | '·' #xB7 (middle dot). Since Cf characters are not | ||
1583 | directly visible, they should be employed with caution and only | ||
1584 | when necessary, to avoid creating names which are distinct to XML | ||
1585 | processors but look the same to human beings.</p></li><li><p>Ideographic characters which have a canonical decomposition | ||
1586 | (including those in the ranges [#xF900-#xFAFF] and | ||
1587 | [#x2F800-#x2FFFD], with 12 exceptions) should not be used in names. | ||
1588 | </p></li><li><p>Characters which have a compatibility decomposition (those with | ||
1589 | a "compatibility formatting tag" in field 5 of the Unicode | ||
1590 | Character Database -- marked by field 5 beginning with a "<") | ||
1591 | should not be used in names. This suggestion does not apply | ||
1592 | to #x0E33 THAI CHARACTER SARA AM or #x0EB3 LAO CHARACTER AM, which | ||
1593 | despite their compatibility decompositions are in regular use in | ||
1594 | those scripts.</p></li><li><p>Combining characters meant for use with symbols only (including | ||
1595 | those in the ranges [#x20D0-#x20EF] and [#x1D165-#x1D1AD]) should | ||
1596 | not be used in names.</p></li><li><p>The interlinear annotation characters ([#xFFF9-#xFFFB]) should | ||
1597 | not be used in names.</p></li><li><p>Variation selector characters should not be used in names.</p></li><li><p>Names which are nonsensical, unpronounceable, hard to read, or | ||
1598 | easily confusable with other names should not be employed.</p></li></ol></div></div></body></html> | ||
diff --git a/src/tests/xml.cpp b/src/tests/xml.cpp new file mode 100644 index 0000000..9ef6a7e --- /dev/null +++ b/src/tests/xml.cpp | |||
@@ -0,0 +1,15 @@ | |||
1 | #include "bu/xmlreader.h" | ||
2 | #include "bu/xmlnode.h" | ||
3 | #include "bu/xmldocument.h" | ||
4 | #include "bu/file.h" | ||
5 | |||
6 | int main() | ||
7 | { | ||
8 | Bu::File f("test.xml", "r"); | ||
9 | Bu::XmlReader xr( f ); | ||
10 | |||
11 | xr.read(); | ||
12 | |||
13 | return 0; | ||
14 | } | ||
15 | |||
diff --git a/src/tsfdocument.cpp b/src/tsfdocument.cpp new file mode 100644 index 0000000..582f1b1 --- /dev/null +++ b/src/tsfdocument.cpp | |||
@@ -0,0 +1,9 @@ | |||
1 | #include "tsfdocument.h" | ||
2 | |||
3 | Bu::TsfDocument::TsfDocument() | ||
4 | { | ||
5 | } | ||
6 | |||
7 | Bu::TsfDocument::~TsfDocument() | ||
8 | { | ||
9 | } | ||
diff --git a/src/tsfdocument.h b/src/tsfdocument.h new file mode 100644 index 0000000..e324459 --- /dev/null +++ b/src/tsfdocument.h | |||
@@ -0,0 +1,22 @@ | |||
1 | #ifndef TSF_DOCUMENT_H | ||
2 | #define TSF_DOCUMENT_H | ||
3 | |||
4 | #include <stdint.h> | ||
5 | |||
namespace Bu
{
    /**
     * Skeleton TsfDocument type.  Only a default constructor and a virtual
     * destructor are declared; the class has no data members yet.
     */
    class TsfDocument
    {
    public:
        TsfDocument();
        virtual ~TsfDocument();

    private:
        // Intentionally empty for now.
    };
}
21 | |||
22 | #endif | ||
diff --git a/src/tsfnode.cpp b/src/tsfnode.cpp new file mode 100644 index 0000000..19df4ed --- /dev/null +++ b/src/tsfnode.cpp | |||
@@ -0,0 +1,9 @@ | |||
1 | #include "tsfnode.h" | ||
2 | |||
3 | Bu::TsfNode::TsfNode() | ||
4 | { | ||
5 | } | ||
6 | |||
7 | Bu::TsfNode::~TsfNode() | ||
8 | { | ||
9 | } | ||
diff --git a/src/tsfnode.h b/src/tsfnode.h new file mode 100644 index 0000000..f58b825 --- /dev/null +++ b/src/tsfnode.h | |||
@@ -0,0 +1,21 @@ | |||
1 | #ifndef TSF_NODE_H | ||
2 | #define TSF_NODE_H | ||
3 | |||
4 | #include <stdint.h> | ||
5 | |||
namespace Bu
{
    /**
     * Skeleton TsfNode type.  Only a default constructor and a virtual
     * destructor are declared; the class has no data members yet.
     */
    class TsfNode
    {
    public:
        TsfNode();
        virtual ~TsfNode();

    private:
        // Intentionally empty for now.
    };
}
21 | #endif | ||
diff --git a/src/tsfreader.cpp b/src/tsfreader.cpp new file mode 100644 index 0000000..58f4f78 --- /dev/null +++ b/src/tsfreader.cpp | |||
@@ -0,0 +1,9 @@ | |||
1 | #include "tsfreader.h" | ||
2 | |||
3 | Bu::TsfReader::TsfReader() | ||
4 | { | ||
5 | } | ||
6 | |||
7 | Bu::TsfReader::~TsfReader() | ||
8 | { | ||
9 | } | ||
diff --git a/src/tsfreader.h b/src/tsfreader.h new file mode 100644 index 0000000..cc8400a --- /dev/null +++ b/src/tsfreader.h | |||
@@ -0,0 +1,22 @@ | |||
1 | #ifndef TSF_READER_H | ||
2 | #define TSF_READER_H | ||
3 | |||
4 | #include <stdint.h> | ||
5 | |||
namespace Bu
{
    /**
     * Skeleton TsfReader type.  Only a default constructor and a virtual
     * destructor are declared; the class has no data members yet.
     */
    class TsfReader
    {
    public:
        TsfReader();
        virtual ~TsfReader();

    private:
        // Intentionally empty for now.
    };
}
21 | |||
22 | #endif | ||
diff --git a/src/tsfwriter.cpp b/src/tsfwriter.cpp new file mode 100644 index 0000000..6592996 --- /dev/null +++ b/src/tsfwriter.cpp | |||
@@ -0,0 +1,9 @@ | |||
1 | #include "tsfwriter.h" | ||
2 | |||
3 | Bu::TsfWriter::TsfWriter() | ||
4 | { | ||
5 | } | ||
6 | |||
7 | Bu::TsfWriter::~TsfWriter() | ||
8 | { | ||
9 | } | ||
diff --git a/src/tsfwriter.h b/src/tsfwriter.h new file mode 100644 index 0000000..18f19d6 --- /dev/null +++ b/src/tsfwriter.h | |||
@@ -0,0 +1,22 @@ | |||
1 | #ifndef TSF_WRITER_H | ||
2 | #define TSF_WRITER_H | ||
3 | |||
4 | #include <stdint.h> | ||
5 | |||
namespace Bu
{
    /**
     * Skeleton TsfWriter type.  Only a default constructor and a virtual
     * destructor are declared; the class has no data members yet.
     */
    class TsfWriter
    {
    public:
        TsfWriter();
        virtual ~TsfWriter();

    private:
        // Intentionally empty for now.
    };
}
21 | |||
22 | #endif | ||
diff --git a/src/xmldocument.cpp b/src/xmldocument.cpp new file mode 100644 index 0000000..cb21826 --- /dev/null +++ b/src/xmldocument.cpp | |||
@@ -0,0 +1,9 @@ | |||
1 | #include "xmldocument.h" | ||
2 | |||
3 | Bu::XmlDocument::XmlDocument() | ||
4 | { | ||
5 | } | ||
6 | |||
7 | Bu::XmlDocument::~XmlDocument() | ||
8 | { | ||
9 | } | ||
diff --git a/src/xmldocument.h b/src/xmldocument.h new file mode 100644 index 0000000..e16e3ea --- /dev/null +++ b/src/xmldocument.h | |||
@@ -0,0 +1,22 @@ | |||
1 | #ifndef XML_DOCUMENT_H | ||
2 | #define XML_DOCUMENT_H | ||
3 | |||
4 | #include <stdint.h> | ||
5 | |||
namespace Bu
{
    /**
     * Skeleton XmlDocument type.  Only a default constructor and a virtual
     * destructor are declared; the class has no data members yet.
     */
    class XmlDocument
    {
    public:
        XmlDocument();
        virtual ~XmlDocument();

    private:
        // Intentionally empty for now.
    };
}
21 | |||
22 | #endif | ||
diff --git a/src/xmlnode.cpp b/src/xmlnode.cpp new file mode 100644 index 0000000..58ef5c5 --- /dev/null +++ b/src/xmlnode.cpp | |||
@@ -0,0 +1,9 @@ | |||
1 | #include "xmlnode.h" | ||
2 | |||
3 | Bu::XmlNode::XmlNode() | ||
4 | { | ||
5 | } | ||
6 | |||
7 | Bu::XmlNode::~XmlNode() | ||
8 | { | ||
9 | } | ||
diff --git a/src/xmlnode.h b/src/xmlnode.h new file mode 100644 index 0000000..cd9961a --- /dev/null +++ b/src/xmlnode.h | |||
@@ -0,0 +1,22 @@ | |||
1 | #ifndef XML_NODE_H | ||
2 | #define XML_NODE_H | ||
3 | |||
4 | #include <stdint.h> | ||
5 | |||
namespace Bu
{
    /**
     * Skeleton XmlNode type.  Only a default constructor and a virtual
     * destructor are declared; the class has no data members yet.
     */
    class XmlNode
    {
    public:
        XmlNode();
        virtual ~XmlNode();

    private:
        // Intentionally empty for now.
    };
}
21 | |||
22 | #endif | ||
diff --git a/src/xmlreader.cpp b/src/xmlreader.cpp new file mode 100644 index 0000000..432ecc1 --- /dev/null +++ b/src/xmlreader.cpp | |||
@@ -0,0 +1,108 @@ | |||
#include "xmlreader.h"

#include <stddef.h>
#include <stdio.h>
#include <string.h>
2 | |||
3 | Bu::XmlReader::XmlReader( Bu::Stream &sIn ) : | ||
4 | sIn( sIn ) | ||
5 | { | ||
6 | } | ||
7 | |||
8 | Bu::XmlReader::~XmlReader() | ||
9 | { | ||
10 | } | ||
11 | |||
12 | const char *Bu::XmlReader::lookahead( int nAmnt ) | ||
13 | { | ||
14 | if( sBuf.getSize() >= nAmnt ) | ||
15 | return sBuf.getStr(); | ||
16 | |||
17 | int nNew = nAmnt - sBuf.getSize(); | ||
18 | char *buf = new char[nNew]; | ||
19 | sIn.read( buf, nNew ); | ||
20 | sBuf.append( buf ); | ||
21 | |||
22 | return sBuf.getStr(); | ||
23 | } | ||
24 | |||
25 | void Bu::XmlReader::burn( int nAmnt ) | ||
26 | { | ||
27 | if( sBuf.getSize() < nAmnt ) | ||
28 | { | ||
29 | lookahead( nAmnt ); | ||
30 | } | ||
31 | |||
32 | sBuf.remove( nAmnt ); | ||
33 | } | ||
34 | |||
35 | void Bu::XmlNode::checkString( const char *str, int nLen ) | ||
36 | { | ||
37 | if( !strncmp( str, lookahead( nLen ), nLen ) ) | ||
38 | { | ||
39 | burn( nLen ); | ||
40 | return; | ||
41 | } | ||
42 | |||
43 | throw Bu::ExceptionBase("Expected string '%s'", str ); | ||
44 | } | ||
45 | |||
46 | Bu::XmlNode *Bu::XmlReader::read() | ||
47 | { | ||
48 | prolog(); | ||
49 | } | ||
50 | |||
51 | void Bu::XmlReader::prolog() | ||
52 | { | ||
53 | XMLDecl(); | ||
54 | Misc(); | ||
55 | } | ||
56 | |||
57 | void Bu::XmlReader::XMLDecl() | ||
58 | { | ||
59 | checkString("<?xml", 5 ); | ||
60 | VersionInfo(); | ||
61 | EncodingDecl(); | ||
62 | SDDecl(); | ||
63 | S(); | ||
64 | } | ||
65 | |||
66 | void Bu::XmlReader::Misc() | ||
67 | { | ||
68 | } | ||
69 | |||
70 | void Bu::XmlReader::S() | ||
71 | { | ||
72 | for( int j = 0;; j++ ) | ||
73 | { | ||
74 | char c = *lookahead( 1 ); | ||
75 | if( c == 0x20 || c == 0x9 || c == 0xD || c == 0xA ) | ||
76 | continue; | ||
77 | if( j == 0 ) | ||
78 | printf("Error, expected whitespace!\n"); | ||
79 | return; | ||
80 | } | ||
81 | } | ||
82 | |||
83 | void Bu::XmlReader::S() | ||
84 | { | ||
85 | for(;;) | ||
86 | { | ||
87 | char c = *lookahead( 1 ); | ||
88 | if( c == 0x20 || c == 0x9 || c == 0xD || c == 0xA ) | ||
89 | continue; | ||
90 | return; | ||
91 | } | ||
92 | } | ||
93 | |||
94 | void Bu::XmlReader::VersionInfo() | ||
95 | { | ||
96 | S(); | ||
97 | checkString("version", 7 ); | ||
98 | |||
99 | } | ||
100 | |||
101 | void Bu::XmlReader::Eq() | ||
102 | { | ||
103 | Sq(); | ||
104 | checkString("=", 1 ); | ||
105 | Sq(); | ||
106 | } | ||
107 | |||
108 | |||
diff --git a/src/xmlreader.h b/src/xmlreader.h new file mode 100644 index 0000000..19791c4 --- /dev/null +++ b/src/xmlreader.h | |||
@@ -0,0 +1,70 @@ | |||
1 | #ifndef XML_READER_H | ||
2 | #define XML_READER_H | ||
3 | |||
4 | #include <stdint.h> | ||
5 | #include "bu/stream.h" | ||
6 | #include "bu/fstring.h" | ||
7 | #include "bu/xmlnode.h" | ||
8 | |||
9 | namespace Bu | ||
10 | { | ||
11 | /** | ||
12 | * | ||
13 | */ | ||
14 | class XmlReader | ||
15 | { | ||
16 | public: | ||
17 | XmlReader( Bu::Stream &sIn ); | ||
18 | virtual ~XmlReader(); | ||
19 | |||
20 | XmlNode *read(); | ||
21 | |||
22 | private: | ||
23 | Bu::Stream &sIn; | ||
24 | Bu::FString sBuf; | ||
25 | |||
26 | private: // Helpers | ||
27 | const char *lookahead( int nAmnt ); | ||
28 | void burn( int nAmnt ); | ||
29 | void checkString( const char *str, int nLen ); | ||
30 | |||
31 | private: // States | ||
32 | /** | ||
33 | * The headers, etc. | ||
34 | */ | ||
35 | void prolog(); | ||
36 | |||
37 | /** | ||
38 | * The xml decleration (version, encoding, etc). | ||
39 | */ | ||
40 | void XMLDecl(); | ||
41 | |||
42 | /** | ||
43 | * Misc things...? | ||
44 | */ | ||
45 | void Misc(); | ||
46 | |||
47 | /** | ||
48 | * Whitespace eater. | ||
49 | */ | ||
50 | void S(); | ||
51 | |||
52 | /** | ||
53 | * Optional whitespace eater. | ||
54 | */ | ||
55 | void Sq(); | ||
56 | |||
57 | /** | ||
58 | * XML Version spec | ||
59 | */ | ||
60 | void VersionInfo(); | ||
61 | |||
62 | /** | ||
63 | * Your basic equals sign with surrounding whitespace. | ||
64 | */ | ||
65 | void Eq(); | ||
66 | |||
67 | }; | ||
68 | } | ||
69 | |||
70 | #endif | ||
diff --git a/src/xmlwriter.cpp b/src/xmlwriter.cpp new file mode 100644 index 0000000..23a5175 --- /dev/null +++ b/src/xmlwriter.cpp | |||
@@ -0,0 +1,9 @@ | |||
1 | #include "xmlwriter.h" | ||
2 | |||
3 | Bu::XmlWriter::XmlWriter() | ||
4 | { | ||
5 | } | ||
6 | |||
7 | Bu::XmlWriter::~XmlWriter() | ||
8 | { | ||
9 | } | ||
diff --git a/src/xmlwriter.h b/src/xmlwriter.h new file mode 100644 index 0000000..796d6fb --- /dev/null +++ b/src/xmlwriter.h | |||
@@ -0,0 +1,22 @@ | |||
1 | #ifndef XML_WRITER_H | ||
2 | #define XML_WRITER_H | ||
3 | |||
4 | #include <stdint.h> | ||
5 | |||
namespace Bu
{
    /**
     * Skeleton XmlWriter type.  Only a default constructor and a virtual
     * destructor are declared; the class has no data members yet.
     */
    class XmlWriter
    {
    public:
        XmlWriter();
        virtual ~XmlWriter();

    private:
        // Intentionally empty for now.
    };
}
21 | |||
22 | #endif | ||