001/*
002 * Copyright (c) 2011+, HL7, Inc
003 * All rights reserved.
004 * 
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 * 
008 * Redistributions of source code must retain the above copyright notice, this
009 * list of conditions and the following disclaimer.
010 * Redistributions in binary form must reproduce the above copyright notice,
011 * this list of conditions and the following disclaimer in the documentation
012 * and/or other materials provided with the distribution.
013 * Neither the name of HL7 nor the names of its contributors may be used to
014 * endorse or promote products derived from this software without specific
015 * prior written permission.
016 * 
017 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
018 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
019 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
020 * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
021 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
022 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
023 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
024 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
025 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
026 * POSSIBILITY OF SUCH DAMAGE.
027 * 
028 */
029package org.hl7.fhir.utilities.xhtml;
030
031/*
032 * #%L
033 * HAPI FHIR - Core Library
034 * %%
035 * Copyright (C) 2014 - 2017 University Health Network
036 * %%
037 * Licensed under the Apache License, Version 2.0 (the "License");
038 * you may not use this file except in compliance with the License.
039 * You may obtain a copy of the License at
040 * 
041 * http://www.apache.org/licenses/LICENSE-2.0
042 * 
043 * Unless required by applicable law or agreed to in writing, software
044 * distributed under the License is distributed on an "AS IS" BASIS,
045 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
046 * See the License for the specific language governing permissions and
047 * limitations under the License.
048 * #L%
049 */
050
051import java.io.*;
052import java.util.*;
053
054import javax.xml.stream.XMLEventReader;
055import javax.xml.stream.XMLStreamException;
056import javax.xml.stream.events.Attribute;
057import javax.xml.stream.events.Characters;
058import javax.xml.stream.events.Comment;
059import javax.xml.stream.events.StartElement;
060import javax.xml.stream.events.XMLEvent;
061
062import org.hl7.fhir.exceptions.FHIRException;
063import org.hl7.fhir.exceptions.FHIRFormatError;
064import org.w3c.dom.Attr;
065import org.w3c.dom.Element;
066import org.w3c.dom.Node;
067
068public class XhtmlParser {
069        public static final String XHTML_NS = "http://www.w3.org/1999/xhtml";
070
071        private Set<String> attributes = new HashSet<String>();
072
073        private String cache = "";
074
075        private int col = 0;
076        private Set<String> elements = new HashSet<String>();
077
078        private char lastChar;
079
080        private String lastText = "";
081
082        private int line = 1;
083
084        private boolean mustBeWellFormed = true;
085        private ParserSecurityPolicy policy;
086        private Reader rdr;
087
088        private boolean trimWhitespace;
089
090        private XhtmlNode unwindPoint;
091
092        private boolean validatorMode;
093
094        public XhtmlParser() {
095                super();
096                policy = ParserSecurityPolicy.Accept; // for general parsing
097
098                // set up sets
099                elements.add("p");
100                elements.add("br");
101                elements.add("div");
102                elements.add("h1");
103                elements.add("h2");
104                elements.add("h3");
105                elements.add("h4");
106                elements.add("h5");
107                elements.add("h6");
108                elements.add("a");
109                elements.add("span");
110                elements.add("b");
111                elements.add("em");
112                elements.add("i");
113                elements.add("strong");
114                elements.add("small");
115                elements.add("big");
116                elements.add("tt");
117                elements.add("small");
118                elements.add("dfn");
119                elements.add("q");
120                elements.add("var");
121                elements.add("abbr");
122                elements.add("acronym");
123                elements.add("cite");
124                elements.add("blockquote");
125                elements.add("hr");
126                elements.add("address");
127                elements.add("bdo");
128                elements.add("kbd");
129                elements.add("q");
130                elements.add("sub");
131                elements.add("sup");
132                elements.add("ul");
133                elements.add("ol");
134                elements.add("li");
135                elements.add("dl");
136                elements.add("dt");
137                elements.add("dd");
138                elements.add("pre");
139                elements.add("table");
140                elements.add("caption");
141                elements.add("colgroup");
142                elements.add("col");
143                elements.add("thead");
144                elements.add("tr");
145                elements.add("tfoot");
146                elements.add("tbody");
147                elements.add("th");
148                elements.add("td");
149                elements.add("code");
150                elements.add("samp");
151                elements.add("img");
152                elements.add("map");
153                elements.add("area");
154
155                attributes.add("title");
156                attributes.add("style");
157                attributes.add("class");
158                attributes.add("id");
159                attributes.add("lang");
160                attributes.add("xml:lang");
161                attributes.add("dir");
162                attributes.add("accesskey");
163                attributes.add("tabindex");
164                // tables:
165                attributes.add("span");
166                attributes.add("width");
167                attributes.add("align");
168                attributes.add("valign");
169                attributes.add("char");
170                attributes.add("charoff");
171                attributes.add("abbr");
172                attributes.add("axis");
173                attributes.add("headers");
174                attributes.add("scope");
175                attributes.add("rowspan");
176                attributes.add("colspan");
177
178                attributes.add("a.href");
179                attributes.add("a.name");
180                attributes.add("img.src");
181                attributes.add("img.border");
182                attributes.add("div.xmlns");
183                attributes.add("blockquote.cite");
184                attributes.add("q.cite");
185                attributes.add("a.charset");
186                attributes.add("a.type");
187                attributes.add("a.name");
188                attributes.add("a.href");
189                attributes.add("a.hreflang");
190                attributes.add("a.rel");
191                attributes.add("a.rev");
192                attributes.add("a.shape");
193                attributes.add("a.coords");
194                attributes.add("img.src");
195                attributes.add("img.alt");
196                attributes.add("img.longdesc");
197                attributes.add("img.height");
198                attributes.add("img.width");
199                attributes.add("img.usemap");
200                attributes.add("img.ismap");
201                attributes.add("map.name");
202                attributes.add("area.shape");
203                attributes.add("area.coords");
204                attributes.add("area.href");
205                attributes.add("area.nohref");
206                attributes.add("area.alt");
207                attributes.add("table.summary");
208                attributes.add("table.width");
209                attributes.add("table.border");
210                attributes.add("table.frame");
211                attributes.add("table.rules");
212                attributes.add("table.cellspacing");
213                attributes.add("table.cellpadding");
214        }
215
216        private void addTextNode(XhtmlNode node, StringBuilder s) {
217                String t = isTrimWhitespace() ? s.toString().trim() : s.toString();
218                if (t.length() > 0) {
219                        lastText = t;
220                        // System.out.println(t);
221                        node.addText(t);
222                        s.setLength(0);
223                }
224        }
225
226        private boolean attributeIsOk(String elem, String attr, String value) throws FHIRFormatError {
227                if (validatorMode)
228                        return true;
229                boolean ok = attributes.contains(attr) || attributes.contains(elem + "." + attr);
230                if (ok) {
231                        return true;
232                }
233                switch (policy) {
234                case Accept:
235                        return true;
236                case Drop:
237                        return false;
238                case Reject:
239                        throw new FHIRFormatError("Illegal HTML attribute " + elem + "." + attr);
240                }
241
242                if ((elem + "." + attr).equals("img.src") && !(value.startsWith("#") || value.startsWith("http:") || value.startsWith("https:"))) {
243                        switch (policy) {
244                        case Accept:
245                                return true;
246                        case Drop:
247                                return false;
248                        case Reject:
249                                throw new FHIRFormatError("Illegal Image Reference " + value);
250                        }
251                }
252                return false;
253        }
254
255        private NSMap checkNamespaces(QName n, XhtmlNode node, NSMap nsm, boolean root) {
256                // what we do here is strip out any stated namespace attributes, putting them in the namesapce map
257                // then we figure out what the namespace of this element is, and state it explicitly if it's not the default
258
259                // but we don't bother with any of this if we're not validating
260                if (!validatorMode)
261                        return null;
262                NSMap result = new NSMap(nsm);
263                List<String> nsattrs = new ArrayList<String>();
264                for (String an : node.getAttributes().keySet()) {
265                        if (an.equals("xmlns")) {
266                                result.def(node.getAttribute(an));
267                                nsattrs.add(an);
268                        }
269                        if (an.startsWith("xmlns:")) {
270                                result.ns(an.substring(6), node.getAttribute(an));
271                                nsattrs.add(an);
272                        }
273                }
274                for (String s : nsattrs)
275                        node.getAttributes().remove(s);
276                if (n.hasNs()) {
277                        String nns = result.get(n.getNs());
278                        if (!nns.equals(result.def())) {
279                                node.getAttributes().put("xmlns", nns);
280                                result.def(nns);
281                        }
282                } else if (root && result.hasDef()) {
283                        node.getAttributes().put("xmlns", result.def());
284                }
285                return result;
286        }
287
288        private String checkNS(XhtmlNode res, Element node, String defaultNS) {
289                if (!validatorMode)
290                        return null;
291                String ns = node.getNamespaceURI();
292                if (ns == null)
293                        return null;
294                if (!ns.equals(defaultNS)) {
295                        res.getAttributes().put("xmlns", ns);
296                        return ns;
297                }
298                return defaultNS;
299        }
300
301        private String descLoc() {
302                return " at line " + Integer.toString(line) + " column " + Integer.toString(col);
303        }
304
305        private boolean elementIsOk(String name) throws FHIRFormatError {
306                if (validatorMode)
307                        return true;
308                boolean ok = elements.contains(name);
309                if (ok){
310                        return true;
311                }
312                switch (policy) {
313                case Accept:
314                        return true;
315                case Drop:
316                        return false;
317                case Reject:
318                        throw new FHIRFormatError("Illegal HTML element " + name);
319                }
320                return false;
321        }
322
323        public ParserSecurityPolicy getPolicy() {
324                return policy;
325        }
326
327        private boolean isInteger(String s, int base) {
328                try {
329                        Integer.parseInt(s, base);
330                        return true;
331                } catch (Exception e) {
332                        return false;
333                }
334        }
335
336        public boolean isMustBeWellFormed() {
337                return mustBeWellFormed;
338        }
339
340        private boolean isNameChar(char ch) {
341                return Character.isLetterOrDigit(ch) || ch == '_' || ch == '-' || ch == ':';
342        }
343
344        public boolean isTrimWhitespace() {
345                return trimWhitespace;
346        }
347
348        public boolean isValidatorMode() {
349                return validatorMode;
350        }
351
352        public XhtmlDocument parse(InputStream input, String entryName) throws FHIRFormatError, IOException {
353                rdr = new InputStreamReader(input, "UTF-8");
354                return parse(entryName);
355        }
356
357        private XhtmlDocument parse(String entryName) throws FHIRFormatError, IOException {
358                XhtmlDocument result = new XhtmlDocument();
359                skipWhiteSpaceAndComments(result);
360                if (peekChar() != '<')
361                        throw new FHIRFormatError("Unable to Parse HTML - does not start with tag. Found " + peekChar() + descLoc());
362                readChar();
363                QName n = new QName(readName().toLowerCase());
364                if ((entryName != null) && !n.getName().equals(entryName))
365                        throw new FHIRFormatError("Unable to Parse HTML - starts with '" + n + "' not '" + entryName + "'" + descLoc());
366                XhtmlNode root = result.addTag(n.getName());
367                parseAttributes(root);
368                NSMap nsm = checkNamespaces(n, root, null, true);
369                if (readChar() == '/') {
370                        if (peekChar() != '>')
371                                throw new FHIRFormatError("unexpected non-end of element " + n + " " + descLoc());
372                        readChar();
373                } else {
374                        unwindPoint = null;
375                        List<XhtmlNode> p = new ArrayList<XhtmlNode>();
376                        parseElementInner(root, p, nsm);
377                }
378                return result;
379        }
380
381        public XhtmlDocument parse(String source, String entryName) throws FHIRFormatError, IOException {
382                rdr = new StringReader(source);
383                return parse(entryName);
384        }
385
386        private void parseAttributes(XhtmlNode node) throws FHIRFormatError, IOException {
387                while (Character.isWhitespace(peekChar()))
388                        readChar();
389                while (peekChar() != '>' && peekChar() != '/' && peekChar() != '\0') {
390                        String name = readName();
391                        if (name.length() == 0) {
392                                throw new FHIRFormatError("Unable to read attribute on <" + node.getName() + ">" + descLoc());
393                        }
394                        while (Character.isWhitespace(peekChar()))
395                                readChar();
396
397                        if (isNameChar(peekChar()) || peekChar() == '>' || peekChar() == '/')
398                                node.getAttributes().put(name, null);
399                        else if (peekChar() != '=') {
400                                throw new FHIRFormatError("Unable to read attribute '" + name + "' value on <" + node.getName() + ">" + descLoc());
401                        } else {
402                                readChar();
403                                while (Character.isWhitespace(peekChar()))
404                                        readChar();
405                                if (peekChar() == '"' || peekChar() == '\'')
406                                        node.getAttributes().put(name, parseAttributeValue(readChar()));
407                                else
408                                        node.getAttributes().put(name, parseAttributeValue('\0'));
409                        }
410                        while (Character.isWhitespace(peekChar()))
411                                readChar();
412                }
413        }
414
415        private String parseAttributeValue(char term) throws IOException, FHIRFormatError {
416                StringBuilder b = new StringBuilder();
417                while (peekChar() != '\0' && peekChar() != '>' && (term != '\0' || peekChar() != '/') && peekChar() != term) {
418                        if (peekChar() == '&') {
419                                parseLiteral(b);
420                        } else
421                                b.append(readChar());
422                }
423                if (peekChar() == term)
424                        readChar();
425                return b.toString();
426        }
427
428        private void parseElement(XhtmlNode parent, List<XhtmlNode> parents, NSMap nsm) throws IOException, FHIRFormatError {
429                QName name = new QName(readName());
430                XhtmlNode node = parent.addTag(name.getName());
431                List<XhtmlNode> newParents = new ArrayList<XhtmlNode>();
432                newParents.addAll(parents);
433                newParents.add(parent);
434                parseAttributes(node);
435                nsm = checkNamespaces(name, node, nsm, false);
436                if (readChar() == '/') {
437                        if (peekChar() != '>')
438                                throw new FHIRFormatError("unexpected non-end of element " + name + " " + descLoc());
439                        readChar();
440                } else {
441                        parseElementInner(node, newParents, nsm);
442                }
443        }
444
445        private void parseElementInner(XhtmlNode node, List<XhtmlNode> parents, NSMap nsm) throws FHIRFormatError, IOException {
446                StringBuilder s = new StringBuilder();
447                while (peekChar() != '\0' && !parents.contains(unwindPoint) && !(node == unwindPoint)) {
448                        if (peekChar() == '<') {
449                                addTextNode(node, s);
450                                readChar();
451                                if (peekChar() == '!') {
452                                        String sc = readToCommentEnd();
453                                        if (sc.startsWith("DOCTYPE"))
454                                                throw new FHIRFormatError("Malformed XHTML: Found a DocType declaration, and these are not allowed (XXE security vulnerability protection)");
455                                        node.addComment(sc);
456                                } else if (peekChar() == '?')
457                                        node.addComment(readToTagEnd());
458                                else if (peekChar() == '/') {
459                                        readChar();
460                                        QName n = new QName(readToTagEnd());
461                                        if (node.getName().equals(n.getName())){
462                                                return;
463                                        }
464                                        if (mustBeWellFormed)
465                                                throw new FHIRFormatError("Malformed XHTML: Found \"</" + n.getName() + ">\" expecting \"</" + node.getName() + ">\"" + descLoc());
466                                        for (int i = parents.size() - 1; i >= 0; i--) {
467                                                if (parents.get(i).getName().equals(n))
468                                                        unwindPoint = parents.get(i);
469                                        }
470                                        if (unwindPoint != null) {
471                                                for (int i = parents.size(); i > 0; i--) {
472                                                        if (i < parents.size() && parents.get(i) == unwindPoint)
473                                                                return;
474                                                        if (i == parents.size()) {
475                                                                parents.get(i - 1).getChildNodes().addAll(node.getChildNodes());
476                                                                node.getChildNodes().clear();
477                                                        } else {
478                                                                parents.get(i - 1).getChildNodes().addAll(parents.get(i).getChildNodes());
479                                                                parents.get(i).getChildNodes().clear();
480                                                        }
481                                                }
482                                        }
483                                } else if (Character.isLetterOrDigit(peekChar())) {
484                                        parseElement(node, parents, nsm);
485                                } else
486                                        throw new FHIRFormatError("Unable to Parse HTML - node '" + node.getName() + "' has unexpected content '" + peekChar() + "' (last text = '" + lastText + "'" + descLoc());
487                        } else if (peekChar() == '&') {
488                                parseLiteral(s);
489                        } else
490                                s.append(readChar());
491                }
492                addTextNode(node, s);
493        }
494
495        private XhtmlNode parseFragment() throws IOException, FHIRException {
496                skipWhiteSpace();
497                if (peekChar() != '<')
498                        throw new FHIRException("Unable to Parse HTML - does not start with tag. Found " + peekChar() + descLoc());
499                readChar();
500                if (peekChar() == '?') {
501                        readToTagEnd();
502                        skipWhiteSpace();
503                        if (peekChar() != '<')
504                                throw new FHIRException("Unable to Parse HTML - does not start with tag after processing instruction. Found " + peekChar() + descLoc());
505                        readChar();
506                }
507                String n = readName().toLowerCase();
508                readToTagEnd();
509                XhtmlNode result = new XhtmlNode(NodeType.Element);
510
511                int colonIndex = n.indexOf(':');
512                if (colonIndex != -1) {
513                        n = n.substring(colonIndex + 1);
514                }
515
516                result.setName(n);
517                unwindPoint = null;
518                List<XhtmlNode> p = new ArrayList<XhtmlNode>();
519                parseElementInner(result, p, null);
520
521                return result;
522        }
523
524        public XhtmlNode parseFragment(InputStream input) throws IOException, FHIRException {
525                rdr = new InputStreamReader(input);
526                return parseFragment();
527        }
528
529        public XhtmlNode parseFragment(String source) throws IOException, FHIRException {
530                rdr = new StringReader(source);
531                return parseFragment();
532        }
533
534        public XhtmlNode parseHtmlNode(Element node) throws FHIRFormatError {
535                return parseHtmlNode(node, null);
536        }
537
538        public XhtmlNode parseHtmlNode(Element node, String defaultNS) throws FHIRFormatError {
539                XhtmlNode res = parseNode(node, defaultNS);
540                if (res.getNsDecl() == null)
541                        res.getAttributes().put("xmlns", XHTML_NS);
542                return res;
543        }
544
545        public XhtmlNode parseHtmlNode(XMLEventReader xpp) throws IOException, FHIRFormatError, XMLStreamException {
546                XhtmlNode res = parseNode(xpp);
547                if (res.getNsDecl() == null)
548                        res.getAttributes().put("xmlns", XHTML_NS);
549                return res;
550
551        }
552
553        private void parseLiteral(StringBuilder s) throws IOException, FHIRFormatError {
554                // UInt16 w;
555                readChar();
556                String c = readUntil(';');
557                if (c.equals("apos"))
558                        s.append('\'');
559                else if (c.equals("quot"))
560                        s.append('"');
561                else if (c.equals("nbsp"))
562                        s.append(XhtmlNode.NBSP);
563                else if (c.equals("amp"))
564                        s.append('&');
565                else if (c.equals("rsquo"))
566                        s.append('’');
567                else if (c.equals("gt"))
568                        s.append('>');
569                else if (c.equals("lt"))
570                        s.append('<');
571                else if (c.equals("copy"))
572                        s.append((char) 169);
573                else if (c.equals("reg"))
574                        s.append((char) 174);
575                else if (c.equals("sect"))
576                        s.append((char) 0xA7);
577                else if (c.charAt(0) == '#') {
578                        if (isInteger(c.substring(1), 10))
579                                s.append((char) Integer.parseInt(c.substring(1)));
580                        else if (c.charAt(1) == 'x' && isInteger(c.substring(2), 16))
581                                s.append((char) Integer.parseInt(c.substring(2), 16));
582                } else if (c.equals("fnof"))
583                        s.append((char) 402); // latin small f with hook = function = florin, U+0192 ISOtech -->
584                else if (c.equals("Alpha"))
585                        s.append((char) 913); // greek capital letter alpha, U+0391
586                else if (c.equals("Beta"))
587                        s.append((char) 914); // greek capital letter beta, U+0392
588                else if (c.equals("Gamma"))
589                        s.append((char) 915); // greek capital letter gamma, U+0393 ISOgrk3
590                else if (c.equals("Delta"))
591                        s.append((char) 916); // greek capital letter delta, U+0394 ISOgrk3
592                else if (c.equals("Epsilon"))
593                        s.append((char) 917); // greek capital letter epsilon, U+0395
594                else if (c.equals("Zeta"))
595                        s.append((char) 918); // greek capital letter zeta, U+0396
596                else if (c.equals("Eta"))
597                        s.append((char) 919); // greek capital letter eta, U+0397
598                else if (c.equals("Theta"))
599                        s.append((char) 920); // greek capital letter theta, U+0398 ISOgrk3
600                else if (c.equals("Iota"))
601                        s.append((char) 921); // greek capital letter iota, U+0399
602                else if (c.equals("Kappa"))
603                        s.append((char) 922); // greek capital letter kappa, U+039A
604                else if (c.equals("Lambda"))
605                        s.append((char) 923); // greek capital letter lambda, U+039B ISOgrk3
606                else if (c.equals("Mu"))
607                        s.append((char) 924); // greek capital letter mu, U+039C
608                else if (c.equals("Nu"))
609                        s.append((char) 925); // greek capital letter nu, U+039D
610                else if (c.equals("Xi"))
611                        s.append((char) 926); // greek capital letter xi, U+039E ISOgrk3
612                else if (c.equals("Omicron"))
613                        s.append((char) 927); // greek capital letter omicron, U+039F
614                else if (c.equals("Pi"))
615                        s.append((char) 928); // greek capital letter pi, U+03A0 ISOgrk3
616                else if (c.equals("Rho"))
617                        s.append((char) 929); // greek capital letter rho, U+03A1
618                else if (c.equals("Sigma"))
619                        s.append((char) 931); // greek capital letter sigma, U+03A3 ISOgrk3
620                else if (c.equals("Tau"))
621                        s.append((char) 932); // greek capital letter tau, U+03A4
622                else if (c.equals("Upsilon"))
623                        s.append((char) 933); // greek capital letter upsilon, U+03A5 ISOgrk3
624                else if (c.equals("Phi"))
625                        s.append((char) 934); // greek capital letter phi, U+03A6 ISOgrk3
626                else if (c.equals("Chi"))
627                        s.append((char) 935); // greek capital letter chi, U+03A7
628                else if (c.equals("Psi"))
629                        s.append((char) 936); // greek capital letter psi, U+03A8 ISOgrk3
630                else if (c.equals("Omega"))
631                        s.append((char) 937); // greek capital letter omega, U+03A9 ISOgrk3
632                else if (c.equals("alpha"))
633                        s.append((char) 945); // greek small letter alpha, U+03B1 ISOgrk3
634                else if (c.equals("beta"))
635                        s.append((char) 946); // greek small letter beta, U+03B2 ISOgrk3
636                else if (c.equals("gamma"))
637                        s.append((char) 947); // greek small letter gamma, U+03B3 ISOgrk3
638                else if (c.equals("delta"))
639                        s.append((char) 948); // greek small letter delta, U+03B4 ISOgrk3
640                else if (c.equals("epsilon"))
641                        s.append((char) 949); // greek small letter epsilon, U+03B5 ISOgrk3
642                else if (c.equals("zeta"))
643                        s.append((char) 950); // greek small letter zeta, U+03B6 ISOgrk3
644                else if (c.equals("eta"))
645                        s.append((char) 951); // greek small letter eta, U+03B7 ISOgrk3
646                else if (c.equals("theta"))
647                        s.append((char) 952); // greek small letter theta, U+03B8 ISOgrk3
648                else if (c.equals("iota"))
649                        s.append((char) 953); // greek small letter iota, U+03B9 ISOgrk3
650                else if (c.equals("kappa"))
651                        s.append((char) 954); // greek small letter kappa, U+03BA ISOgrk3
652                else if (c.equals("lambda"))
653                        s.append((char) 955); // greek small letter lambda, U+03BB ISOgrk3
654                else if (c.equals("mu"))
655                        s.append((char) 956); // greek small letter mu, U+03BC ISOgrk3
656                else if (c.equals("nu"))
657                        s.append((char) 957); // greek small letter nu, U+03BD ISOgrk3
658                else if (c.equals("xi"))
659                        s.append((char) 958); // greek small letter xi, U+03BE ISOgrk3
660                else if (c.equals("omicron"))
661                        s.append((char) 959); // greek small letter omicron, U+03BF NEW
662                else if (c.equals("pi"))
663                        s.append((char) 960); // greek small letter pi, U+03C0 ISOgrk3
664                else if (c.equals("rho"))
665                        s.append((char) 961); // greek small letter rho, U+03C1 ISOgrk3
666                else if (c.equals("sigmaf"))
667                        s.append((char) 962); // greek small letter final sigma, U+03C2 ISOgrk3
668                else if (c.equals("sigma"))
669                        s.append((char) 963); // greek small letter sigma, U+03C3 ISOgrk3
670                else if (c.equals("tau"))
671                        s.append((char) 964); // greek small letter tau, U+03C4 ISOgrk3
672                else if (c.equals("upsilon"))
673                        s.append((char) 965); // greek small letter upsilon, U+03C5 ISOgrk3
674                else if (c.equals("phi"))
675                        s.append((char) 966); // greek small letter phi, U+03C6 ISOgrk3
676                else if (c.equals("chi"))
677                        s.append((char) 967); // greek small letter chi, U+03C7 ISOgrk3
678                else if (c.equals("psi"))
679                        s.append((char) 968); // greek small letter psi, U+03C8 ISOgrk3
680                else if (c.equals("omega"))
681                        s.append((char) 969); // greek small letter omega, U+03C9 ISOgrk3
682                else if (c.equals("thetasym"))
683                        s.append((char) 977); // greek small letter theta symbol, U+03D1 NEW
684                else if (c.equals("upsih"))
685                        s.append((char) 978); // greek upsilon with hook symbol, U+03D2 NEW
686                else if (c.equals("piv"))
687                        s.append((char) 982); // greek pi symbol, U+03D6 ISOgrk3
688                else if (c.equals("bull"))
689                        s.append((char) 8226); // bullet = black small circle, U+2022 ISOpub
690                else if (c.equals("hellip"))
691                        s.append((char) 8230); // horizontal ellipsis = three dot leader, U+2026 ISOpub
692                else if (c.equals("prime"))
693                        s.append((char) 8242); // prime = minutes = feet, U+2032 ISOtech
694                else if (c.equals("Prime"))
695                        s.append((char) 8243); // double prime = seconds = inches, U+2033 ISOtech
696                else if (c.equals("oline"))
697                        s.append((char) 8254); // overline = spacing overscore, U+203E NEW
698                else if (c.equals("frasl"))
699                        s.append((char) 8260); // fraction slash, U+2044 NEW
700                else if (c.equals("weierp"))
701                        s.append((char) 8472); // script capital P = power set = Weierstrass p, U+2118 ISOamso
702                else if (c.equals("image"))
703                        s.append((char) 8465); // blackletter capital I = imaginary part, U+2111 ISOamso
704                else if (c.equals("real"))
705                        s.append((char) 8476); // blackletter capital R = real part symbol, U+211C ISOamso
706                else if (c.equals("trade"))
707                        s.append((char) 8482); // trade mark sign, U+2122 ISOnum
708                else if (c.equals("alefsym"))
709                        s.append((char) 8501); // alef symbol = first transfinite cardinal, U+2135 NEW
710                else if (c.equals("larr"))
711                        s.append((char) 8592); // leftwards arrow, U+2190 ISOnum
712                else if (c.equals("uarr"))
713                        s.append((char) 8593); // upwards arrow, U+2191 ISOnum
714                else if (c.equals("rarr"))
715                        s.append((char) 8594); // rightwards arrow, U+2192 ISOnum
716                else if (c.equals("darr"))
717                        s.append((char) 8595); // downwards arrow, U+2193 ISOnum
718                else if (c.equals("harr"))
719                        s.append((char) 8596); // left right arrow, U+2194 ISOamsa
720                else if (c.equals("crarr"))
721                        s.append((char) 8629); // downwards arrow with corner leftwards = carriage return, U+21B5 NEW
722                else if (c.equals("lArr"))
723                        s.append((char) 8656); // leftwards double arrow, U+21D0 ISOtech
724                else if (c.equals("uArr"))
725                        s.append((char) 8657); // upwards double arrow, U+21D1 ISOamsa
726                else if (c.equals("rArr"))
727                        s.append((char) 8658); // rightwards double arrow, U+21D2 ISOtech
728                else if (c.equals("dArr"))
729                        s.append((char) 8659); // downwards double arrow, U+21D3 ISOamsa
730                else if (c.equals("hArr"))
731                        s.append((char) 8660); // left right double arrow, U+21D4 ISOamsa
732                else if (c.equals("forall"))
733                        s.append((char) 8704); // for all, U+2200 ISOtech
734                else if (c.equals("part"))
735                        s.append((char) 8706); // partial differential, U+2202 ISOtech
736                else if (c.equals("exist"))
737                        s.append((char) 8707); // there exists, U+2203 ISOtech
738                else if (c.equals("empty"))
739                        s.append((char) 8709); // empty set = null set = diameter, U+2205 ISOamso
740                else if (c.equals("nabla"))
741                        s.append((char) 8711); // nabla = backward difference, U+2207 ISOtech
742                else if (c.equals("isin"))
743                        s.append((char) 8712); // element of, U+2208 ISOtech
744                else if (c.equals("notin"))
745                        s.append((char) 8713); // not an element of, U+2209 ISOtech
746                else if (c.equals("ni"))
747                        s.append((char) 8715); // contains as member, U+220B ISOtech
748                else if (c.equals("prod"))
749                        s.append((char) 8719); // n-ary product = product sign, U+220F ISOamsb
750                else if (c.equals("sum"))
751                        s.append((char) 8721); // n-ary sumation, U+2211 ISOamsb
752                else if (c.equals("minus"))
753                        s.append((char) 8722); // minus sign, U+2212 ISOtech
754                else if (c.equals("lowast"))
755                        s.append((char) 8727); // asterisk operator, U+2217 ISOtech
756                else if (c.equals("radic"))
757                        s.append((char) 8730); // square root = radical sign, U+221A ISOtech
758                else if (c.equals("prop"))
759                        s.append((char) 8733); // proportional to, U+221D ISOtech
760                else if (c.equals("infin"))
761                        s.append((char) 8734); // infinity, U+221E ISOtech -->
762                else if (c.equals("ang"))
763                        s.append((char) 8736); // angle, U+2220 ISOamso
764                else if (c.equals("and"))
765                        s.append((char) 8743); // logical and = wedge, U+2227 ISOtech
766                else if (c.equals("or"))
767                        s.append((char) 8744); // logical or = vee, U+2228 ISOtech
768                else if (c.equals("cap"))
769                        s.append((char) 8745); // intersection = cap, U+2229 ISOtech
770                else if (c.equals("cup"))
771                        s.append((char) 8746); // union = cup, U+222A ISOtech
772                else if (c.equals("int"))
773                        s.append((char) 8747); // integral, U+222B ISOtech
774                else if (c.equals("there4"))
775                        s.append((char) 8756); // therefore, U+2234 ISOtech
776                else if (c.equals("sim"))
777                        s.append((char) 8764); // tilde operator = varies with = similar t U+223C ISOtech
778                else if (c.equals("cong"))
779                        s.append((char) 8773); // approximately equal to, U+2245 ISOtec
780                else if (c.equals("asymp"))
781                        s.append((char) 8776); // almost equal to = asymptotic to, U+2248 ISOamsr
782                else if (c.equals("ne"))
783                        s.append((char) 8800); // not equal to, U+2260 ISOtech
784                else if (c.equals("equiv"))
785                        s.append((char) 8801); // identical to, U+2261 ISOtech
786                else if (c.equals("le"))
787                        s.append((char) 8804); // less-than or equal to, U+2264 ISOtech
788                else if (c.equals("ge"))
789                        s.append((char) 8805); // greater-than or equal to, U+2265 ISOtech
790                else if (c.equals("sub"))
791                        s.append((char) 8834); // subset of, U+2282 ISOtech
792                else if (c.equals("sup"))
793                        s.append((char) 8835); // superset of, U+2283 ISOtech
794                else if (c.equals("nsub"))
795                        s.append((char) 8836); // not a subset of, U+2284 ISOamsn
796                else if (c.equals("sube"))
797                        s.append((char) 8838); // subset of or equal to, U+2286 ISOtech
798                else if (c.equals("supe"))
799                        s.append((char) 8839); // superset of or equal to, U+2287 ISOtech
800                else if (c.equals("oplus"))
801                        s.append((char) 8853); // circled plus = direct sum, U+2295 ISOamsb
802                else if (c.equals("otimes"))
803                        s.append((char) 8855); // circled times = vector product, U+2297 ISOamsb -->
804                else if (c.equals("perp"))
805                        s.append((char) 8869); // up tack = orthogonal to = perpendicular, U+22A5 ISOtech
806                else if (c.equals("sdot"))
807                        s.append((char) 8901); // dot operator, U+22C5 ISOamsb
808                else if (c.equals("lceil"))
809                        s.append((char) 8968); // left ceiling = apl upstile, U+2308 ISOamsc
810                else if (c.equals("rceil"))
811                        s.append((char) 8969); // right ceiling, U+2309 ISOamsc
812                else if (c.equals("lfloor"))
813                        s.append((char) 8970); // left floor = apl downstile, U+230A ISOamsc
814                else if (c.equals("rfloor"))
815                        s.append((char) 8971); // right floor, U+230B ISOamsc
816                else if (c.equals("lang"))
817                        s.append((char) 9001); // left-pointing angle bracket = bra, U+2329 ISOtech
818                else if (c.equals("rang"))
819                        s.append((char) 9002); // right-pointing angle bracket = ket, U+232A ISOtech
820                else if (c.equals("loz"))
821                        s.append((char) 9674); // lozenge, U+25CA ISOpub
822                else if (c.equals("spades"))
823                        s.append((char) 9824); // black spade suit, U+2660 ISOpub
824                else if (c.equals("clubs"))
825                        s.append((char) 9827); // black club suit = shamrock, U+2663 ISOpub
826                else if (c.equals("hearts"))
827                        s.append((char) 9829); // black heart suit = valentine, U+2665 ISOpub
828                else if (c.equals("diams"))
829                        s.append((char) 9830); // black diamond suit, U+2666 ISOpub --
830                else
831                        throw new FHIRFormatError("unable to parse character reference '" + c + "'' (last text = '" + lastText + "'" + descLoc());
832        }
833
834        private XhtmlNode parseNode(Element node, String defaultNS) throws FHIRFormatError {
835                XhtmlNode res = new XhtmlNode(NodeType.Element);
836                res.setName(node.getLocalName());
837                defaultNS = checkNS(res, node, defaultNS);
838                for (int i = 0; i < node.getAttributes().getLength(); i++) {
839                        Attr attr = (Attr) node.getAttributes().item(i);
840                        if (attributeIsOk(res.getName(), attr.getName(), attr.getValue()) && !attr.getLocalName().startsWith("xmlns"))
841                                res.getAttributes().put(attr.getName(), attr.getValue());
842                }
843                Node child = node.getFirstChild();
844                while (child != null) {
845                        if (child.getNodeType() == Node.TEXT_NODE) {
846                                res.addText(child.getTextContent());
847                        } else if (child.getNodeType() == Node.COMMENT_NODE) {
848                                res.addComment(child.getTextContent());
849                        } else if (child.getNodeType() == Node.ELEMENT_NODE) {
850                                if (elementIsOk(child.getLocalName()))
851                                        res.getChildNodes().add(parseNode((Element) child, defaultNS));
852                        } else
853                                throw new FHIRFormatError("Unhandled XHTML feature: " + Integer.toString(child.getNodeType()) + descLoc());
854                        child = child.getNextSibling();
855                }
856                return res;
857        }
858
859        private XhtmlNode parseNode(XMLEventReader xpp) throws IOException, FHIRFormatError, XMLStreamException {
860                XhtmlNode res = new XhtmlNode(NodeType.Element);
861
862                if (!xpp.hasNext()) {
863                        return res;
864                }
865
866                StartElement firstEvent = (StartElement) xpp.nextEvent();
867                res.setName(firstEvent.getSchemaType().getLocalPart());
868
869                for (Iterator<?> attrIter = firstEvent.getAttributes(); attrIter.hasNext();) {
870                        Attribute nextAttr = (Attribute) attrIter.next();
871                        if (attributeIsOk(firstEvent.getName().getLocalPart(), nextAttr.getName().getLocalPart(), nextAttr.getValue()))
872                                res.getAttributes().put(nextAttr.getName().getLocalPart(), nextAttr.getValue());
873                }
874
875                while (xpp.hasNext()) {
876                        XMLEvent nextEvent = xpp.nextEvent();
877                        int eventType = nextEvent.getEventType();
878                        if (eventType != XMLEvent.END_ELEMENT) {
879                                break;
880                        }
881                        if (eventType == XMLEvent.CHARACTERS) {
882                                res.addText(((Characters) xpp).getData());
883                        } else if (eventType == XMLEvent.COMMENT) {
884                                res.addComment(((Comment) xpp).getText());
885                        } else if (eventType == XMLEvent.START_ELEMENT) {
886                                StartElement nextStart = (StartElement) nextEvent;
887                                if (elementIsOk(nextStart.getName().getLocalPart())) {
888                                        res.getChildNodes().add(parseNode(xpp));
889                                }
890                        } else {
891                                throw new FHIRFormatError("Unhandled XHTML feature: " + Integer.toString(eventType) + descLoc());
892                        }
893                }
894                xpp.next();
895                return res;
896        }
897
898        private char peekChar() throws IOException {
899                if (cache.length() > 0)
900                        return cache.charAt(0);
901                else if (!rdr.ready())
902                        return '\0';
903                else {
904                        char c = (char) rdr.read();
905                        if (c == (char) -1) {
906                                cache = "";
907                                return '\0';
908                        }
909                        cache = Character.toString(c);
910                        return c;
911                }
912        }
913
914        private void pushChar(char ch) {
915                cache = Character.toString(ch) + cache;
916        }
917
918        private char readChar() throws IOException {
919                char c;
920                if (cache.length() > 0) {
921                        c = cache.charAt(0);
922                        cache = cache.length() == 1 ? "" : cache.substring(1);
923                } else if (!rdr.ready())
924                        c = '\0';
925                else
926                        c = (char) rdr.read();
927                if (c == '\r' || c == '\n') {
928                        if (c == '\r' || lastChar != '\r') {
929                                line++;
930                                col = 0;
931                        }
932                        lastChar = c;
933                }
934                col++;
935                return c;
936        }
937
938        private String readName() throws IOException {
939                StringBuilder s = new StringBuilder();
940                while (isNameChar(peekChar()))
941                        s.append(readChar());
942                return s.toString();
943        }
944
945        private String readToCommentEnd() throws IOException, FHIRFormatError {
946                if (peekChar() == '!')
947                        readChar();
948                StringBuilder s = new StringBuilder();
949
950                boolean simple = true;
951                if (peekChar() == '-') {
952                        readChar();
953                        simple = peekChar() != '-';
954                        if (simple)
955                                s.append('-');
956                        else
957                                readChar();
958                }
959
960                boolean done = false;
961                while (!done) {
962                        char c = peekChar();
963                        if (c == '-') {
964                                readChar();
965                                if (peekChar() == '-') {
966                                        readChar();
967                                        if (peekChar() == '>') {
968                                                done = true;
969                                        } else
970                                                s.append("--");
971                                } else
972                                        s.append('-');
973                        } else if (simple && peekChar() == '>') {
974                                done = true;
975                        } else if (c != '\0')
976                                s.append(readChar());
977                        else if (mustBeWellFormed)
978                                throw new FHIRFormatError("Unexpected termination of html source" + descLoc());
979                }
980                if (peekChar() != '\0') {
981                        readChar();
982                        skipWhiteSpace();
983                }
984                return s.toString();
985        }
986
987        private String readToTagEnd() throws IOException, FHIRFormatError {
988                StringBuilder s = new StringBuilder();
989                while (peekChar() != '>' && peekChar() != '\0')
990                        s.append(readChar());
991                if (peekChar() != '\0') {
992                        readChar();
993                        skipWhiteSpace();
994                } else if (mustBeWellFormed)
995                        throw new FHIRFormatError("Unexpected termination of html source" + descLoc());
996                return s.toString();
997        }
998
999        private String readUntil(char ch) throws IOException {
1000                StringBuilder s = new StringBuilder();
1001                while (peekChar() != 0 && peekChar() != ch)
1002                        s.append(readChar());
1003                readChar();
1004                return s.toString();
1005        }
1006
1007        public void setMustBeWellFormed(boolean mustBeWellFormed) {
1008                this.mustBeWellFormed = mustBeWellFormed;
1009        }
1010
1011        public void setPolicy(ParserSecurityPolicy policy) {
1012                this.policy = policy;
1013        }
1014
1015        public void setTrimWhitespace(boolean trimWhitespace) {
1016                this.trimWhitespace = trimWhitespace;
1017        }
1018
1019        public XhtmlParser setValidatorMode(boolean validatorMode) {
1020                this.validatorMode = validatorMode;
1021                return this;
1022        }
1023
1024        private void skipWhiteSpace() throws IOException {
1025                if (trimWhitespace)
1026                        while (Character.isWhitespace(peekChar()) || (peekChar() == 0xfeff))
1027                                readChar();
1028        }
1029
1030        private void skipWhiteSpaceAndComments(XhtmlNode focus) throws IOException, FHIRFormatError {
1031                while (Character.isWhitespace(peekChar()) || (peekChar() == 0xfeff))
1032                        readChar();
1033                if (peekChar() == '<') {
1034                        char ch = readChar();
1035                        if (peekChar() == '!') {
1036                                readChar();
1037                                if (peekChar() == '-') {
1038                                        readChar();
1039                                        if (peekChar() == '-') {
1040                                                readChar();
1041                                                if (peekChar() == ' ')
1042                                                        readChar();
1043                                                focus.addComment(readToCommentEnd());
1044                                        } else
1045                                                throw new FHIRFormatError("unrecognised element type <!" + peekChar() + descLoc());
1046                                } else
1047                                        focus.addDocType(readToCommentEnd());
1048                                skipWhiteSpaceAndComments(focus);
1049                        } else if (peekChar() == '?') {
1050                                String r = readToTagEnd();
1051                                focus.addInstruction(r.substring(1, r.length() - 1));
1052                                skipWhiteSpaceAndComments(focus);
1053                        } else
1054                                pushChar(ch);
1055                }
1056        }
1057
1058        public class NSMap {
1059                private Map<String, String> nslist = new HashMap<String, String>();
1060
1061                public NSMap(NSMap nsm) {
1062                        if (nsm != null)
1063                                nslist.putAll(nsm.nslist);
1064                }
1065
1066                public String def() {
1067                        return nslist.get("");
1068                }
1069
1070                public void def(String ns) {
1071                        nslist.put("", ns);
1072                }
1073
1074                public String get(String abbrev) {
1075                        return nslist.containsKey(abbrev) ? nslist.get(abbrev) : "http://error/undefined-namespace";
1076                }
1077
1078                public boolean hasDef() {
1079                        return nslist.containsKey("");
1080                }
1081
1082                public void ns(String abbrev, String ns) {
1083                        nslist.put(abbrev, ns);
1084                }
1085        }
1086
1087        public enum ParserSecurityPolicy {
1088                Accept, Drop, Reject
1089        }
1090
1091        public class QName {
1092                private String name;
1093                private String ns;
1094
1095                public QName(String src) {
1096                        if (src.contains(":")) {
1097                                ns = src.substring(0, src.indexOf(":"));
1098                                name = src.substring(src.indexOf(":") + 1);
1099                        } else {
1100                                ns = null;
1101                                name = src;
1102                        }
1103                }
1104
1105                public String getName() {
1106                        return name;
1107                }
1108
1109                public String getNs() {
1110                        return ns;
1111                }
1112
1113                public boolean hasNs() {
1114                        return ns != null;
1115                }
1116
1117        }
1118
1119}