001/* 002 * Copyright (c) 2011+, HL7, Inc 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without modification, 006 * are permitted provided that the following conditions are met: 007 * 008 * Redistributions of source code must retain the above copyright notice, this 009 * list of conditions and the following disclaimer. 010 * Redistributions in binary form must reproduce the above copyright notice, 011 * this list of conditions and the following disclaimer in the documentation 012 * and/or other materials provided with the distribution. 013 * Neither the name of HL7 nor the names of its contributors may be used to 014 * endorse or promote products derived from this software without specific 015 * prior written permission. 016 * 017 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 018 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 019 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 020 * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 021 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 022 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 023 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 024 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 025 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 026 * POSSIBILITY OF SUCH DAMAGE. 027 * 028 */ 029package org.hl7.fhir.utilities.xhtml; 030 031/* 032 * #%L 033 * HAPI FHIR - Core Library 034 * %% 035 * Copyright (C) 2014 - 2017 University Health Network 036 * %% 037 * Licensed under the Apache License, Version 2.0 (the "License"); 038 * you may not use this file except in compliance with the License. 039 * You may obtain a copy of the License at 040 * 041 * http://www.apache.org/licenses/LICENSE-2.0 042 * 043 * Unless required by applicable law or agreed to in writing, software 044 * distributed under the License is distributed on an "AS IS" BASIS, 045 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 046 * See the License for the specific language governing permissions and 047 * limitations under the License. 048 * #L% 049 */ 050 051import java.io.*; 052import java.util.*; 053 054import javax.xml.stream.XMLEventReader; 055import javax.xml.stream.XMLStreamException; 056import javax.xml.stream.events.Attribute; 057import javax.xml.stream.events.Characters; 058import javax.xml.stream.events.Comment; 059import javax.xml.stream.events.StartElement; 060import javax.xml.stream.events.XMLEvent; 061 062import org.hl7.fhir.exceptions.FHIRException; 063import org.hl7.fhir.exceptions.FHIRFormatError; 064import org.w3c.dom.Attr; 065import org.w3c.dom.Element; 066import org.w3c.dom.Node; 067 068public class XhtmlParser { 069 public static final String XHTML_NS = "http://www.w3.org/1999/xhtml"; 070 071 private Set<String> attributes = new HashSet<String>(); 072 073 private String cache = ""; 074 075 private int col = 0; 076 private Set<String> elements = new HashSet<String>(); 077 078 private char lastChar; 079 080 private String lastText = ""; 081 082 private int line = 1; 083 084 private boolean mustBeWellFormed = true; 085 private ParserSecurityPolicy policy; 086 private Reader rdr; 087 088 private boolean trimWhitespace; 089 090 private XhtmlNode unwindPoint; 091 092 private boolean validatorMode; 093 094 public XhtmlParser() { 095 super(); 096 policy = ParserSecurityPolicy.Accept; // for general parsing 097 098 // set up sets 099 elements.add("p"); 100 elements.add("br"); 101 elements.add("div"); 102 elements.add("h1"); 103 elements.add("h2"); 104 elements.add("h3"); 105 elements.add("h4"); 106 elements.add("h5"); 107 elements.add("h6"); 108 elements.add("a"); 109 elements.add("span"); 110 elements.add("b"); 111 elements.add("em"); 112 elements.add("i"); 113 elements.add("strong"); 114 elements.add("small"); 115 elements.add("big"); 116 elements.add("tt"); 117 elements.add("small"); 118 elements.add("dfn"); 119 elements.add("q"); 120 elements.add("var"); 121 elements.add("abbr"); 122 elements.add("acronym"); 123 elements.add("cite"); 124 elements.add("blockquote"); 125 elements.add("hr"); 126 elements.add("address"); 127 elements.add("bdo"); 128 elements.add("kbd"); 129 elements.add("q"); 130 elements.add("sub"); 131 elements.add("sup"); 132 elements.add("ul"); 133 elements.add("ol"); 134 elements.add("li"); 135 elements.add("dl"); 136 elements.add("dt"); 137 elements.add("dd"); 138 elements.add("pre"); 139 elements.add("table"); 140 elements.add("caption"); 141 elements.add("colgroup"); 142 elements.add("col"); 143 elements.add("thead"); 144 elements.add("tr"); 145 elements.add("tfoot"); 146 elements.add("tbody"); 147 elements.add("th"); 148 elements.add("td"); 149 elements.add("code"); 150 elements.add("samp"); 151 elements.add("img"); 152 elements.add("map"); 153 elements.add("area"); 154 155 attributes.add("title"); 156 attributes.add("style"); 157 attributes.add("class"); 158 attributes.add("id"); 159 attributes.add("lang"); 160 attributes.add("xml:lang"); 161 attributes.add("dir"); 162 attributes.add("accesskey"); 163 attributes.add("tabindex"); 164 // tables: 165 attributes.add("span"); 166 attributes.add("width"); 167 attributes.add("align"); 168 attributes.add("valign"); 169 attributes.add("char"); 170 attributes.add("charoff"); 171 attributes.add("abbr"); 172 attributes.add("axis"); 173 attributes.add("headers"); 174 attributes.add("scope"); 175 attributes.add("rowspan"); 176 attributes.add("colspan"); 177 178 attributes.add("a.href"); 179 attributes.add("a.name"); 180 attributes.add("img.src"); 181 attributes.add("img.border"); 182 attributes.add("div.xmlns"); 183 attributes.add("blockquote.cite"); 184 attributes.add("q.cite"); 185 attributes.add("a.charset"); 186 attributes.add("a.type"); 187 attributes.add("a.name"); 188 attributes.add("a.href"); 189 attributes.add("a.hreflang"); 190 attributes.add("a.rel"); 191 attributes.add("a.rev"); 192 attributes.add("a.shape"); 193 attributes.add("a.coords"); 194 attributes.add("img.src"); 195 attributes.add("img.alt"); 196 attributes.add("img.longdesc"); 197 attributes.add("img.height"); 198 attributes.add("img.width"); 199 attributes.add("img.usemap"); 200 attributes.add("img.ismap"); 201 attributes.add("map.name"); 202 attributes.add("area.shape"); 203 attributes.add("area.coords"); 204 attributes.add("area.href"); 205 attributes.add("area.nohref"); 206 attributes.add("area.alt"); 207 attributes.add("table.summary"); 208 attributes.add("table.width"); 209 attributes.add("table.border"); 210 attributes.add("table.frame"); 211 attributes.add("table.rules"); 212 attributes.add("table.cellspacing"); 213 attributes.add("table.cellpadding"); 214 } 215 216 private void addTextNode(XhtmlNode node, StringBuilder s) { 217 String t = isTrimWhitespace() ? s.toString().trim() : s.toString(); 218 if (t.length() > 0) { 219 lastText = t; 220 // System.out.println(t); 221 node.addText(t); 222 s.setLength(0); 223 } 224 } 225 226 private boolean attributeIsOk(String elem, String attr, String value) throws FHIRFormatError { 227 if (validatorMode) 228 return true; 229 boolean ok = attributes.contains(attr) || attributes.contains(elem + "." + attr); 230 if (ok) { 231 return true; 232 } 233 switch (policy) { 234 case Accept: 235 return true; 236 case Drop: 237 return false; 238 case Reject: 239 throw new FHIRFormatError("Illegal HTML attribute " + elem + "." + attr); 240 } 241 242 if ((elem + "." + attr).equals("img.src") && !(value.startsWith("#") || value.startsWith("http:") || value.startsWith("https:"))) { 243 switch (policy) { 244 case Accept: 245 return true; 246 case Drop: 247 return false; 248 case Reject: 249 throw new FHIRFormatError("Illegal Image Reference " + value); 250 } 251 } 252 return false; 253 } 254 255 private NSMap checkNamespaces(QName n, XhtmlNode node, NSMap nsm, boolean root) { 256 // what we do here is strip out any stated namespace attributes, putting them in the namesapce map 257 // then we figure out what the namespace of this element is, and state it explicitly if it's not the default 258 259 // but we don't bother with any of this if we're not validating 260 if (!validatorMode) 261 return null; 262 NSMap result = new NSMap(nsm); 263 List<String> nsattrs = new ArrayList<String>(); 264 for (String an : node.getAttributes().keySet()) { 265 if (an.equals("xmlns")) { 266 result.def(node.getAttribute(an)); 267 nsattrs.add(an); 268 } 269 if (an.startsWith("xmlns:")) { 270 result.ns(an.substring(6), node.getAttribute(an)); 271 nsattrs.add(an); 272 } 273 } 274 for (String s : nsattrs) 275 node.getAttributes().remove(s); 276 if (n.hasNs()) { 277 String nns = result.get(n.getNs()); 278 if (!nns.equals(result.def())) { 279 node.getAttributes().put("xmlns", nns); 280 result.def(nns); 281 } 282 } else if (root && result.hasDef()) { 283 node.getAttributes().put("xmlns", result.def()); 284 } 285 return result; 286 } 287 288 private String checkNS(XhtmlNode res, Element node, String defaultNS) { 289 if (!validatorMode) 290 return null; 291 String ns = node.getNamespaceURI(); 292 if (ns == null) 293 return null; 294 if (!ns.equals(defaultNS)) { 295 res.getAttributes().put("xmlns", ns); 296 return ns; 297 } 298 return defaultNS; 299 } 300 301 private String descLoc() { 302 return " at line " + Integer.toString(line) + " column " + Integer.toString(col); 303 } 304 305 private boolean elementIsOk(String name) throws FHIRFormatError { 306 if (validatorMode) 307 return true; 308 boolean ok = elements.contains(name); 309 if (ok){ 310 return true; 311 } 312 switch (policy) { 313 case Accept: 314 return true; 315 case Drop: 316 return false; 317 case Reject: 318 throw new FHIRFormatError("Illegal HTML element " + name); 319 } 320 return false; 321 } 322 323 public ParserSecurityPolicy getPolicy() { 324 return policy; 325 } 326 327 private boolean isInteger(String s, int base) { 328 try { 329 Integer.parseInt(s, base); 330 return true; 331 } catch (Exception e) { 332 return false; 333 } 334 } 335 336 public boolean isMustBeWellFormed() { 337 return mustBeWellFormed; 338 } 339 340 private boolean isNameChar(char ch) { 341 return Character.isLetterOrDigit(ch) || ch == '_' || ch == '-' || ch == ':'; 342 } 343 344 public boolean isTrimWhitespace() { 345 return trimWhitespace; 346 } 347 348 public boolean isValidatorMode() { 349 return validatorMode; 350 } 351 352 public XhtmlDocument parse(InputStream input, String entryName) throws FHIRFormatError, IOException { 353 rdr = new InputStreamReader(input, "UTF-8"); 354 return parse(entryName); 355 } 356 357 private XhtmlDocument parse(String entryName) throws FHIRFormatError, IOException { 358 XhtmlDocument result = new XhtmlDocument(); 359 skipWhiteSpaceAndComments(result); 360 if (peekChar() != '<') 361 throw new FHIRFormatError("Unable to Parse HTML - does not start with tag. Found " + peekChar() + descLoc()); 362 readChar(); 363 QName n = new QName(readName().toLowerCase()); 364 if ((entryName != null) && !n.getName().equals(entryName)) 365 throw new FHIRFormatError("Unable to Parse HTML - starts with '" + n + "' not '" + entryName + "'" + descLoc()); 366 XhtmlNode root = result.addTag(n.getName()); 367 parseAttributes(root); 368 NSMap nsm = checkNamespaces(n, root, null, true); 369 if (readChar() == '/') { 370 if (peekChar() != '>') 371 throw new FHIRFormatError("unexpected non-end of element " + n + " " + descLoc()); 372 readChar(); 373 } else { 374 unwindPoint = null; 375 List<XhtmlNode> p = new ArrayList<XhtmlNode>(); 376 parseElementInner(root, p, nsm); 377 } 378 return result; 379 } 380 381 public XhtmlDocument parse(String source, String entryName) throws FHIRFormatError, IOException { 382 rdr = new StringReader(source); 383 return parse(entryName); 384 } 385 386 private void parseAttributes(XhtmlNode node) throws FHIRFormatError, IOException { 387 while (Character.isWhitespace(peekChar())) 388 readChar(); 389 while (peekChar() != '>' && peekChar() != '/' && peekChar() != '\0') { 390 String name = readName(); 391 if (name.length() == 0) { 392 throw new FHIRFormatError("Unable to read attribute on <" + node.getName() + ">" + descLoc()); 393 } 394 while (Character.isWhitespace(peekChar())) 395 readChar(); 396 397 if (isNameChar(peekChar()) || peekChar() == '>' || peekChar() == '/') 398 node.getAttributes().put(name, null); 399 else if (peekChar() != '=') { 400 throw new FHIRFormatError("Unable to read attribute '" + name + "' value on <" + node.getName() + ">" + descLoc()); 401 } else { 402 readChar(); 403 while (Character.isWhitespace(peekChar())) 404 readChar(); 405 if (peekChar() == '"' || peekChar() == '\'') 406 node.getAttributes().put(name, parseAttributeValue(readChar())); 407 else 408 node.getAttributes().put(name, parseAttributeValue('\0')); 409 } 410 while (Character.isWhitespace(peekChar())) 411 readChar(); 412 } 413 } 414 415 private String parseAttributeValue(char term) throws IOException, FHIRFormatError { 416 StringBuilder b = new StringBuilder(); 417 while (peekChar() != '\0' && peekChar() != '>' && (term != '\0' || peekChar() != '/') && peekChar() != term) { 418 if (peekChar() == '&') { 419 parseLiteral(b); 420 } else 421 b.append(readChar()); 422 } 423 if (peekChar() == term) 424 readChar(); 425 return b.toString(); 426 } 427 428 private void parseElement(XhtmlNode parent, List<XhtmlNode> parents, NSMap nsm) throws IOException, FHIRFormatError { 429 QName name = new QName(readName()); 430 XhtmlNode node = parent.addTag(name.getName()); 431 List<XhtmlNode> newParents = new ArrayList<XhtmlNode>(); 432 newParents.addAll(parents); 433 newParents.add(parent); 434 parseAttributes(node); 435 nsm = checkNamespaces(name, node, nsm, false); 436 if (readChar() == '/') { 437 if (peekChar() != '>') 438 throw new FHIRFormatError("unexpected non-end of element " + name + " " + descLoc()); 439 readChar(); 440 } else { 441 parseElementInner(node, newParents, nsm); 442 } 443 } 444 445 private void parseElementInner(XhtmlNode node, List<XhtmlNode> parents, NSMap nsm) throws FHIRFormatError, IOException { 446 StringBuilder s = new StringBuilder(); 447 while (peekChar() != '\0' && !parents.contains(unwindPoint) && !(node == unwindPoint)) { 448 if (peekChar() == '<') { 449 addTextNode(node, s); 450 readChar(); 451 if (peekChar() == '!') { 452 String sc = readToCommentEnd(); 453 if (sc.startsWith("DOCTYPE")) 454 throw new FHIRFormatError("Malformed XHTML: Found a DocType declaration, and these are not allowed (XXE security vulnerability protection)"); 455 node.addComment(sc); 456 } else if (peekChar() == '?') 457 node.addComment(readToTagEnd()); 458 else if (peekChar() == '/') { 459 readChar(); 460 QName n = new QName(readToTagEnd()); 461 if (node.getName().equals(n.getName())){ 462 return; 463 } 464 if (mustBeWellFormed) 465 throw new FHIRFormatError("Malformed XHTML: Found \"</" + n.getName() + ">\" expecting \"</" + node.getName() + ">\"" + descLoc()); 466 for (int i = parents.size() - 1; i >= 0; i--) { 467 if (parents.get(i).getName().equals(n)) 468 unwindPoint = parents.get(i); 469 } 470 if (unwindPoint != null) { 471 for (int i = parents.size(); i > 0; i--) { 472 if (i < parents.size() && parents.get(i) == unwindPoint) 473 return; 474 if (i == parents.size()) { 475 parents.get(i - 1).getChildNodes().addAll(node.getChildNodes()); 476 node.getChildNodes().clear(); 477 } else { 478 parents.get(i - 1).getChildNodes().addAll(parents.get(i).getChildNodes()); 479 parents.get(i).getChildNodes().clear(); 480 } 481 } 482 } 483 } else if (Character.isLetterOrDigit(peekChar())) { 484 parseElement(node, parents, nsm); 485 } else 486 throw new FHIRFormatError("Unable to Parse HTML - node '" + node.getName() + "' has unexpected content '" + peekChar() + "' (last text = '" + lastText + "'" + descLoc()); 487 } else if (peekChar() == '&') { 488 parseLiteral(s); 489 } else 490 s.append(readChar()); 491 } 492 addTextNode(node, s); 493 } 494 495 private XhtmlNode parseFragment() throws IOException, FHIRException { 496 skipWhiteSpace(); 497 if (peekChar() != '<') 498 throw new FHIRException("Unable to Parse HTML - does not start with tag. Found " + peekChar() + descLoc()); 499 readChar(); 500 if (peekChar() == '?') { 501 readToTagEnd(); 502 skipWhiteSpace(); 503 if (peekChar() != '<') 504 throw new FHIRException("Unable to Parse HTML - does not start with tag after processing instruction. Found " + peekChar() + descLoc()); 505 readChar(); 506 } 507 String n = readName().toLowerCase(); 508 readToTagEnd(); 509 XhtmlNode result = new XhtmlNode(NodeType.Element); 510 511 int colonIndex = n.indexOf(':'); 512 if (colonIndex != -1) { 513 n = n.substring(colonIndex + 1); 514 } 515 516 result.setName(n); 517 unwindPoint = null; 518 List<XhtmlNode> p = new ArrayList<XhtmlNode>(); 519 parseElementInner(result, p, null); 520 521 return result; 522 } 523 524 public XhtmlNode parseFragment(InputStream input) throws IOException, FHIRException { 525 rdr = new InputStreamReader(input); 526 return parseFragment(); 527 } 528 529 public XhtmlNode parseFragment(String source) throws IOException, FHIRException { 530 rdr = new StringReader(source); 531 return parseFragment(); 532 } 533 534 public XhtmlNode parseHtmlNode(Element node) throws FHIRFormatError { 535 return parseHtmlNode(node, null); 536 } 537 538 public XhtmlNode parseHtmlNode(Element node, String defaultNS) throws FHIRFormatError { 539 XhtmlNode res = parseNode(node, defaultNS); 540 if (res.getNsDecl() == null) 541 res.getAttributes().put("xmlns", XHTML_NS); 542 return res; 543 } 544 545 public XhtmlNode parseHtmlNode(XMLEventReader xpp) throws IOException, FHIRFormatError, XMLStreamException { 546 XhtmlNode res = parseNode(xpp); 547 if (res.getNsDecl() == null) 548 res.getAttributes().put("xmlns", XHTML_NS); 549 return res; 550 551 } 552 553 private void parseLiteral(StringBuilder s) throws IOException, FHIRFormatError { 554 // UInt16 w; 555 readChar(); 556 String c = readUntil(';'); 557 if (c.equals("apos")) 558 s.append('\''); 559 else if (c.equals("quot")) 560 s.append('"'); 561 else if (c.equals("nbsp")) 562 s.append(XhtmlNode.NBSP); 563 else if (c.equals("amp")) 564 s.append('&'); 565 else if (c.equals("rsquo")) 566 s.append('’'); 567 else if (c.equals("gt")) 568 s.append('>'); 569 else if (c.equals("lt")) 570 s.append('<'); 571 else if (c.equals("copy")) 572 s.append((char) 169); 573 else if (c.equals("reg")) 574 s.append((char) 174); 575 else if (c.equals("sect")) 576 s.append((char) 0xA7); 577 else if (c.charAt(0) == '#') { 578 if (isInteger(c.substring(1), 10)) 579 s.append((char) Integer.parseInt(c.substring(1))); 580 else if (c.charAt(1) == 'x' && isInteger(c.substring(2), 16)) 581 s.append((char) Integer.parseInt(c.substring(2), 16)); 582 } else if (c.equals("fnof")) 583 s.append((char) 402); // latin small f with hook = function = florin, U+0192 ISOtech --> 584 else if (c.equals("Alpha")) 585 s.append((char) 913); // greek capital letter alpha, U+0391 586 else if (c.equals("Beta")) 587 s.append((char) 914); // greek capital letter beta, U+0392 588 else if (c.equals("Gamma")) 589 s.append((char) 915); // greek capital letter gamma, U+0393 ISOgrk3 590 else if (c.equals("Delta")) 591 s.append((char) 916); // greek capital letter delta, U+0394 ISOgrk3 592 else if (c.equals("Epsilon")) 593 s.append((char) 917); // greek capital letter epsilon, U+0395 594 else if (c.equals("Zeta")) 595 s.append((char) 918); // greek capital letter zeta, U+0396 596 else if (c.equals("Eta")) 597 s.append((char) 919); // greek capital letter eta, U+0397 598 else if (c.equals("Theta")) 599 s.append((char) 920); // greek capital letter theta, U+0398 ISOgrk3 600 else if (c.equals("Iota")) 601 s.append((char) 921); // greek capital letter iota, U+0399 602 else if (c.equals("Kappa")) 603 s.append((char) 922); // greek capital letter kappa, U+039A 604 else if (c.equals("Lambda")) 605 s.append((char) 923); // greek capital letter lambda, U+039B ISOgrk3 606 else if (c.equals("Mu")) 607 s.append((char) 924); // greek capital letter mu, U+039C 608 else if (c.equals("Nu")) 609 s.append((char) 925); // greek capital letter nu, U+039D 610 else if (c.equals("Xi")) 611 s.append((char) 926); // greek capital letter xi, U+039E ISOgrk3 612 else if (c.equals("Omicron")) 613 s.append((char) 927); // greek capital letter omicron, U+039F 614 else if (c.equals("Pi")) 615 s.append((char) 928); // greek capital letter pi, U+03A0 ISOgrk3 616 else if (c.equals("Rho")) 617 s.append((char) 929); // greek capital letter rho, U+03A1 618 else if (c.equals("Sigma")) 619 s.append((char) 931); // greek capital letter sigma, U+03A3 ISOgrk3 620 else if (c.equals("Tau")) 621 s.append((char) 932); // greek capital letter tau, U+03A4 622 else if (c.equals("Upsilon")) 623 s.append((char) 933); // greek capital letter upsilon, U+03A5 ISOgrk3 624 else if (c.equals("Phi")) 625 s.append((char) 934); // greek capital letter phi, U+03A6 ISOgrk3 626 else if (c.equals("Chi")) 627 s.append((char) 935); // greek capital letter chi, U+03A7 628 else if (c.equals("Psi")) 629 s.append((char) 936); // greek capital letter psi, U+03A8 ISOgrk3 630 else if (c.equals("Omega")) 631 s.append((char) 937); // greek capital letter omega, U+03A9 ISOgrk3 632 else if (c.equals("alpha")) 633 s.append((char) 945); // greek small letter alpha, U+03B1 ISOgrk3 634 else if (c.equals("beta")) 635 s.append((char) 946); // greek small letter beta, U+03B2 ISOgrk3 636 else if (c.equals("gamma")) 637 s.append((char) 947); // greek small letter gamma, U+03B3 ISOgrk3 638 else if (c.equals("delta")) 639 s.append((char) 948); // greek small letter delta, U+03B4 ISOgrk3 640 else if (c.equals("epsilon")) 641 s.append((char) 949); // greek small letter epsilon, U+03B5 ISOgrk3 642 else if (c.equals("zeta")) 643 s.append((char) 950); // greek small letter zeta, U+03B6 ISOgrk3 644 else if (c.equals("eta")) 645 s.append((char) 951); // greek small letter eta, U+03B7 ISOgrk3 646 else if (c.equals("theta")) 647 s.append((char) 952); // greek small letter theta, U+03B8 ISOgrk3 648 else if (c.equals("iota")) 649 s.append((char) 953); // greek small letter iota, U+03B9 ISOgrk3 650 else if (c.equals("kappa")) 651 s.append((char) 954); // greek small letter kappa, U+03BA ISOgrk3 652 else if (c.equals("lambda")) 653 s.append((char) 955); // greek small letter lambda, U+03BB ISOgrk3 654 else if (c.equals("mu")) 655 s.append((char) 956); // greek small letter mu, U+03BC ISOgrk3 656 else if (c.equals("nu")) 657 s.append((char) 957); // greek small letter nu, U+03BD ISOgrk3 658 else if (c.equals("xi")) 659 s.append((char) 958); // greek small letter xi, U+03BE ISOgrk3 660 else if (c.equals("omicron")) 661 s.append((char) 959); // greek small letter omicron, U+03BF NEW 662 else if (c.equals("pi")) 663 s.append((char) 960); // greek small letter pi, U+03C0 ISOgrk3 664 else if (c.equals("rho")) 665 s.append((char) 961); // greek small letter rho, U+03C1 ISOgrk3 666 else if (c.equals("sigmaf")) 667 s.append((char) 962); // greek small letter final sigma, U+03C2 ISOgrk3 668 else if (c.equals("sigma")) 669 s.append((char) 963); // greek small letter sigma, U+03C3 ISOgrk3 670 else if (c.equals("tau")) 671 s.append((char) 964); // greek small letter tau, U+03C4 ISOgrk3 672 else if (c.equals("upsilon")) 673 s.append((char) 965); // greek small letter upsilon, U+03C5 ISOgrk3 674 else if (c.equals("phi")) 675 s.append((char) 966); // greek small letter phi, U+03C6 ISOgrk3 676 else if (c.equals("chi")) 677 s.append((char) 967); // greek small letter chi, U+03C7 ISOgrk3 678 else if (c.equals("psi")) 679 s.append((char) 968); // greek small letter psi, U+03C8 ISOgrk3 680 else if (c.equals("omega")) 681 s.append((char) 969); // greek small letter omega, U+03C9 ISOgrk3 682 else if (c.equals("thetasym")) 683 s.append((char) 977); // greek small letter theta symbol, U+03D1 NEW 684 else if (c.equals("upsih")) 685 s.append((char) 978); // greek upsilon with hook symbol, U+03D2 NEW 686 else if (c.equals("piv")) 687 s.append((char) 982); // greek pi symbol, U+03D6 ISOgrk3 688 else if (c.equals("bull")) 689 s.append((char) 8226); // bullet = black small circle, U+2022 ISOpub 690 else if (c.equals("hellip")) 691 s.append((char) 8230); // horizontal ellipsis = three dot leader, U+2026 ISOpub 692 else if (c.equals("prime")) 693 s.append((char) 8242); // prime = minutes = feet, U+2032 ISOtech 694 else if (c.equals("Prime")) 695 s.append((char) 8243); // double prime = seconds = inches, U+2033 ISOtech 696 else if (c.equals("oline")) 697 s.append((char) 8254); // overline = spacing overscore, U+203E NEW 698 else if (c.equals("frasl")) 699 s.append((char) 8260); // fraction slash, U+2044 NEW 700 else if (c.equals("weierp")) 701 s.append((char) 8472); // script capital P = power set = Weierstrass p, U+2118 ISOamso 702 else if (c.equals("image")) 703 s.append((char) 8465); // blackletter capital I = imaginary part, U+2111 ISOamso 704 else if (c.equals("real")) 705 s.append((char) 8476); // blackletter capital R = real part symbol, U+211C ISOamso 706 else if (c.equals("trade")) 707 s.append((char) 8482); // trade mark sign, U+2122 ISOnum 708 else if (c.equals("alefsym")) 709 s.append((char) 8501); // alef symbol = first transfinite cardinal, U+2135 NEW 710 else if (c.equals("larr")) 711 s.append((char) 8592); // leftwards arrow, U+2190 ISOnum 712 else if (c.equals("uarr")) 713 s.append((char) 8593); // upwards arrow, U+2191 ISOnum 714 else if (c.equals("rarr")) 715 s.append((char) 8594); // rightwards arrow, U+2192 ISOnum 716 else if (c.equals("darr")) 717 s.append((char) 8595); // downwards arrow, U+2193 ISOnum 718 else if (c.equals("harr")) 719 s.append((char) 8596); // left right arrow, U+2194 ISOamsa 720 else if (c.equals("crarr")) 721 s.append((char) 8629); // downwards arrow with corner leftwards = carriage return, U+21B5 NEW 722 else if (c.equals("lArr")) 723 s.append((char) 8656); // leftwards double arrow, U+21D0 ISOtech 724 else if (c.equals("uArr")) 725 s.append((char) 8657); // upwards double arrow, U+21D1 ISOamsa 726 else if (c.equals("rArr")) 727 s.append((char) 8658); // rightwards double arrow, U+21D2 ISOtech 728 else if (c.equals("dArr")) 729 s.append((char) 8659); // downwards double arrow, U+21D3 ISOamsa 730 else if (c.equals("hArr")) 731 s.append((char) 8660); // left right double arrow, U+21D4 ISOamsa 732 else if (c.equals("forall")) 733 s.append((char) 8704); // for all, U+2200 ISOtech 734 else if (c.equals("part")) 735 s.append((char) 8706); // partial differential, U+2202 ISOtech 736 else if (c.equals("exist")) 737 s.append((char) 8707); // there exists, U+2203 ISOtech 738 else if (c.equals("empty")) 739 s.append((char) 8709); // empty set = null set = diameter, U+2205 ISOamso 740 else if (c.equals("nabla")) 741 s.append((char) 8711); // nabla = backward difference, U+2207 ISOtech 742 else if (c.equals("isin")) 743 s.append((char) 8712); // element of, U+2208 ISOtech 744 else if (c.equals("notin")) 745 s.append((char) 8713); // not an element of, U+2209 ISOtech 746 else if (c.equals("ni")) 747 s.append((char) 8715); // contains as member, U+220B ISOtech 748 else if (c.equals("prod")) 749 s.append((char) 8719); // n-ary product = product sign, U+220F ISOamsb 750 else if (c.equals("sum")) 751 s.append((char) 8721); // n-ary sumation, U+2211 ISOamsb 752 else if (c.equals("minus")) 753 s.append((char) 8722); // minus sign, U+2212 ISOtech 754 else if (c.equals("lowast")) 755 s.append((char) 8727); // asterisk operator, U+2217 ISOtech 756 else if (c.equals("radic")) 757 s.append((char) 8730); // square root = radical sign, U+221A ISOtech 758 else if (c.equals("prop")) 759 s.append((char) 8733); // proportional to, U+221D ISOtech 760 else if (c.equals("infin")) 761 s.append((char) 8734); // infinity, U+221E ISOtech --> 762 else if (c.equals("ang")) 763 s.append((char) 8736); // angle, U+2220 ISOamso 764 else if (c.equals("and")) 765 s.append((char) 8743); // logical and = wedge, U+2227 ISOtech 766 else if (c.equals("or")) 767 s.append((char) 8744); // logical or = vee, U+2228 ISOtech 768 else if (c.equals("cap")) 769 s.append((char) 8745); // intersection = cap, U+2229 ISOtech 770 else if (c.equals("cup")) 771 s.append((char) 8746); // union = cup, U+222A ISOtech 772 else if (c.equals("int")) 773 s.append((char) 8747); // integral, U+222B ISOtech 774 else if (c.equals("there4")) 775 s.append((char) 8756); // therefore, U+2234 ISOtech 776 else if (c.equals("sim")) 777 s.append((char) 8764); // tilde operator = varies with = similar t U+223C ISOtech 778 else if (c.equals("cong")) 779 s.append((char) 8773); // approximately equal to, U+2245 ISOtec 780 else if (c.equals("asymp")) 781 s.append((char) 8776); // almost equal to = asymptotic to, U+2248 ISOamsr 782 else if (c.equals("ne")) 783 s.append((char) 8800); // not equal to, U+2260 ISOtech 784 else if (c.equals("equiv")) 785 s.append((char) 8801); // identical to, U+2261 ISOtech 786 else if (c.equals("le")) 787 s.append((char) 8804); // less-than or equal to, U+2264 ISOtech 788 else if (c.equals("ge")) 789 s.append((char) 8805); // greater-than or equal to, U+2265 ISOtech 790 else if (c.equals("sub")) 791 s.append((char) 8834); // subset of, U+2282 ISOtech 792 else if (c.equals("sup")) 793 s.append((char) 8835); // superset of, U+2283 ISOtech 794 else if (c.equals("nsub")) 795 s.append((char) 8836); // not a subset of, U+2284 ISOamsn 796 else if (c.equals("sube")) 797 s.append((char) 8838); // subset of or equal to, U+2286 ISOtech 798 else if (c.equals("supe")) 799 s.append((char) 8839); // superset of or equal to, U+2287 ISOtech 800 else if (c.equals("oplus")) 801 s.append((char) 8853); // circled plus = direct sum, U+2295 ISOamsb 802 else if (c.equals("otimes")) 803 s.append((char) 8855); // circled times = vector product, U+2297 ISOamsb --> 804 else if (c.equals("perp")) 805 s.append((char) 8869); // up tack = orthogonal to = perpendicular, U+22A5 ISOtech 806 else if (c.equals("sdot")) 807 s.append((char) 8901); // dot operator, U+22C5 ISOamsb 808 else if (c.equals("lceil")) 809 s.append((char) 8968); // left ceiling = apl upstile, U+2308 ISOamsc 810 else if (c.equals("rceil")) 811 s.append((char) 8969); // right ceiling, U+2309 ISOamsc 812 else if (c.equals("lfloor")) 813 s.append((char) 8970); // left floor = apl downstile, U+230A ISOamsc 814 else if (c.equals("rfloor")) 815 s.append((char) 8971); // right floor, U+230B ISOamsc 816 else if (c.equals("lang")) 817 s.append((char) 9001); // left-pointing angle bracket = bra, U+2329 ISOtech 818 else if (c.equals("rang")) 819 s.append((char) 9002); // right-pointing angle bracket = ket, U+232A ISOtech 820 else if (c.equals("loz")) 821 s.append((char) 9674); // lozenge, U+25CA ISOpub 822 else if (c.equals("spades")) 823 s.append((char) 9824); // black spade suit, U+2660 ISOpub 824 else if (c.equals("clubs")) 825 s.append((char) 9827); // black club suit = shamrock, U+2663 ISOpub 826 else if (c.equals("hearts")) 827 s.append((char) 9829); // black heart suit = valentine, U+2665 ISOpub 828 else if (c.equals("diams")) 829 s.append((char) 9830); // black diamond suit, U+2666 ISOpub -- 830 else 831 throw new FHIRFormatError("unable to parse character reference '" + c + "'' (last text = '" + lastText + "'" + descLoc()); 832 } 833 834 private XhtmlNode parseNode(Element node, String defaultNS) throws FHIRFormatError { 835 XhtmlNode res = new XhtmlNode(NodeType.Element); 836 res.setName(node.getLocalName()); 837 defaultNS = checkNS(res, node, defaultNS); 838 for (int i = 0; i < node.getAttributes().getLength(); i++) { 839 Attr attr = (Attr) node.getAttributes().item(i); 840 if (attributeIsOk(res.getName(), attr.getName(), attr.getValue()) && !attr.getLocalName().startsWith("xmlns")) 841 res.getAttributes().put(attr.getName(), attr.getValue()); 842 } 843 Node child = node.getFirstChild(); 844 while (child != null) { 845 if (child.getNodeType() == Node.TEXT_NODE) { 846 res.addText(child.getTextContent()); 847 } else if (child.getNodeType() == Node.COMMENT_NODE) { 848 res.addComment(child.getTextContent()); 849 } else if (child.getNodeType() == Node.ELEMENT_NODE) { 850 if (elementIsOk(child.getLocalName())) 851 res.getChildNodes().add(parseNode((Element) child, defaultNS)); 852 } else 853 throw new FHIRFormatError("Unhandled XHTML feature: " + Integer.toString(child.getNodeType()) + descLoc()); 854 child = child.getNextSibling(); 855 } 856 return res; 857 } 858 859 private XhtmlNode parseNode(XMLEventReader xpp) throws IOException, FHIRFormatError, XMLStreamException { 860 XhtmlNode res = new XhtmlNode(NodeType.Element); 861 862 if (!xpp.hasNext()) { 863 return res; 864 } 865 866 StartElement firstEvent = (StartElement) xpp.nextEvent(); 867 res.setName(firstEvent.getSchemaType().getLocalPart()); 868 869 for (Iterator<?> attrIter = firstEvent.getAttributes(); attrIter.hasNext();) { 870 Attribute nextAttr = (Attribute) attrIter.next(); 871 if (attributeIsOk(firstEvent.getName().getLocalPart(), nextAttr.getName().getLocalPart(), nextAttr.getValue())) 872 res.getAttributes().put(nextAttr.getName().getLocalPart(), nextAttr.getValue()); 873 } 874 875 while (xpp.hasNext()) { 876 XMLEvent nextEvent = xpp.nextEvent(); 877 int eventType = nextEvent.getEventType(); 878 if (eventType != XMLEvent.END_ELEMENT) { 879 break; 880 } 881 if (eventType == XMLEvent.CHARACTERS) { 882 res.addText(((Characters) xpp).getData()); 883 } else if (eventType == XMLEvent.COMMENT) { 884 res.addComment(((Comment) xpp).getText()); 885 } else if (eventType == XMLEvent.START_ELEMENT) { 886 StartElement nextStart = (StartElement) nextEvent; 887 if (elementIsOk(nextStart.getName().getLocalPart())) { 888 res.getChildNodes().add(parseNode(xpp)); 889 } 890 } else { 891 throw new FHIRFormatError("Unhandled XHTML feature: " + Integer.toString(eventType) + descLoc()); 892 } 893 } 894 xpp.next(); 895 return res; 896 } 897 898 private char peekChar() throws IOException { 899 if (cache.length() > 0) 900 return cache.charAt(0); 901 else if (!rdr.ready()) 902 return '\0'; 903 else { 904 char c = (char) rdr.read(); 905 if (c == (char) -1) { 906 cache = ""; 907 return '\0'; 908 } 909 cache = Character.toString(c); 910 return c; 911 } 912 } 913 914 private void pushChar(char ch) { 915 cache = Character.toString(ch) + cache; 916 } 917 918 private char readChar() throws IOException { 919 char c; 920 if (cache.length() > 0) { 921 c = cache.charAt(0); 922 cache = cache.length() == 1 ? "" : cache.substring(1); 923 } else if (!rdr.ready()) 924 c = '\0'; 925 else 926 c = (char) rdr.read(); 927 if (c == '\r' || c == '\n') { 928 if (c == '\r' || lastChar != '\r') { 929 line++; 930 col = 0; 931 } 932 lastChar = c; 933 } 934 col++; 935 return c; 936 } 937 938 private String readName() throws IOException { 939 StringBuilder s = new StringBuilder(); 940 while (isNameChar(peekChar())) 941 s.append(readChar()); 942 return s.toString(); 943 } 944 945 private String readToCommentEnd() throws IOException, FHIRFormatError { 946 if (peekChar() == '!') 947 readChar(); 948 StringBuilder s = new StringBuilder(); 949 950 boolean simple = true; 951 if (peekChar() == '-') { 952 readChar(); 953 simple = peekChar() != '-'; 954 if (simple) 955 s.append('-'); 956 else 957 readChar(); 958 } 959 960 boolean done = false; 961 while (!done) { 962 char c = peekChar(); 963 if (c == '-') { 964 readChar(); 965 if (peekChar() == '-') { 966 readChar(); 967 if (peekChar() == '>') { 968 done = true; 969 } else 970 s.append("--"); 971 } else 972 s.append('-'); 973 } else if (simple && peekChar() == '>') { 974 done = true; 975 } else if (c != '\0') 976 s.append(readChar()); 977 else if (mustBeWellFormed) 978 throw new FHIRFormatError("Unexpected termination of html source" + descLoc()); 979 } 980 if (peekChar() != '\0') { 981 readChar(); 982 skipWhiteSpace(); 983 } 984 return s.toString(); 985 } 986 987 private String readToTagEnd() throws IOException, FHIRFormatError { 988 StringBuilder s = new StringBuilder(); 989 while (peekChar() != '>' && peekChar() != '\0') 990 s.append(readChar()); 991 if (peekChar() != '\0') { 992 readChar(); 993 skipWhiteSpace(); 994 } else if (mustBeWellFormed) 995 throw new FHIRFormatError("Unexpected termination of html source" + descLoc()); 996 return s.toString(); 997 } 998 999 private String readUntil(char ch) throws IOException { 1000 StringBuilder s = new StringBuilder(); 1001 while (peekChar() != 0 && peekChar() != ch) 1002 s.append(readChar()); 1003 readChar(); 1004 return s.toString(); 1005 } 1006 1007 public void setMustBeWellFormed(boolean mustBeWellFormed) { 1008 this.mustBeWellFormed = mustBeWellFormed; 1009 } 1010 1011 public void setPolicy(ParserSecurityPolicy policy) { 1012 this.policy = policy; 1013 } 1014 1015 public void setTrimWhitespace(boolean trimWhitespace) { 1016 this.trimWhitespace = trimWhitespace; 1017 } 1018 1019 public XhtmlParser setValidatorMode(boolean validatorMode) { 1020 this.validatorMode = validatorMode; 1021 return this; 1022 } 1023 1024 private void skipWhiteSpace() throws IOException { 1025 if (trimWhitespace) 1026 while (Character.isWhitespace(peekChar()) || (peekChar() == 0xfeff)) 1027 readChar(); 1028 } 1029 1030 private void skipWhiteSpaceAndComments(XhtmlNode focus) throws IOException, FHIRFormatError { 1031 while (Character.isWhitespace(peekChar()) || (peekChar() == 0xfeff)) 1032 readChar(); 1033 if (peekChar() == '<') { 1034 char ch = readChar(); 1035 if (peekChar() == '!') { 1036 readChar(); 1037 if (peekChar() == '-') { 1038 readChar(); 1039 if (peekChar() == '-') { 1040 readChar(); 1041 if (peekChar() == ' ') 1042 readChar(); 1043 focus.addComment(readToCommentEnd()); 1044 } else 1045 throw new FHIRFormatError("unrecognised element type <!" + peekChar() + descLoc()); 1046 } else 1047 focus.addDocType(readToCommentEnd()); 1048 skipWhiteSpaceAndComments(focus); 1049 } else if (peekChar() == '?') { 1050 String r = readToTagEnd(); 1051 focus.addInstruction(r.substring(1, r.length() - 1)); 1052 skipWhiteSpaceAndComments(focus); 1053 } else 1054 pushChar(ch); 1055 } 1056 } 1057 1058 public class NSMap { 1059 private Map<String, String> nslist = new HashMap<String, String>(); 1060 1061 public NSMap(NSMap nsm) { 1062 if (nsm != null) 1063 nslist.putAll(nsm.nslist); 1064 } 1065 1066 public String def() { 1067 return nslist.get(""); 1068 } 1069 1070 public void def(String ns) { 1071 nslist.put("", ns); 1072 } 1073 1074 public String get(String abbrev) { 1075 return nslist.containsKey(abbrev) ? nslist.get(abbrev) : "http://error/undefined-namespace"; 1076 } 1077 1078 public boolean hasDef() { 1079 return nslist.containsKey(""); 1080 } 1081 1082 public void ns(String abbrev, String ns) { 1083 nslist.put(abbrev, ns); 1084 } 1085 } 1086 1087 public enum ParserSecurityPolicy { 1088 Accept, Drop, Reject 1089 } 1090 1091 public class QName { 1092 private String name; 1093 private String ns; 1094 1095 public QName(String src) { 1096 if (src.contains(":")) { 1097 ns = src.substring(0, src.indexOf(":")); 1098 name = src.substring(src.indexOf(":") + 1); 1099 } else { 1100 ns = null; 1101 name = src; 1102 } 1103 } 1104 1105 public String getName() { 1106 return name; 1107 } 1108 1109 public String getNs() { 1110 return ns; 1111 } 1112 1113 public boolean hasNs() { 1114 return ns != null; 1115 } 1116 1117 } 1118 1119}