package ca.uhn.hl7v2.preparser;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.SortedMap;
import java.util.TreeMap;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.helpers.DefaultHandler;

import ca.uhn.hl7v2.HL7Exception;

/**
 * Pre-parser for XML-encoded HL7 messages: runs a SAX parse over the message
 * text and dumps the data found to a {@link Properties} object, keyed by the
 * string form of the {@link DatumPath} at which each datum was found.
 *
 * @see #parseMessage(Properties, String, Collection)
 */
public class XML {

    /**
     * Thrown by {@link HL7MessageHandler} to abort parsing when a SAX callback
     * arrives in a state the handler cannot make sense of.
     */
    @SuppressWarnings("serial")
    protected static class StopParsingException extends SAXException {
        public StopParsingException() {
            super("ca.uhn.hl7.....StopParsingException");
        }
    }

    /**
     * The SAXParser reports parsing events to an object of this class.
     * We keep track of some parsing state, and the Properties object that
     * we're supposed to write our data to.
     */
    protected static class HL7MessageHandler extends DefaultHandler {

        /* m_props & m_msgMask should be set by the user of this handler before
           they pass this handler to SAXParser.parse() or whatever. */

        /**
         * The data that is found while parsing, and which passes m_msgMask,
         * will be dumped to m_props, as (DatumPath.toString() / text) key/value
         * pairs.
         */
        public Properties m_props = null;

        /** Specifies what parts of a message should be dumped to m_props. */
        public Collection<DatumPath> m_msgMask = null;

        /* All other fields are parser state. */

        protected boolean m_startedDocument = false;

        /* m_msgID / m_curPath together keep track of where we are in the document.

           If m_msgID.length() != 0, then we're within the message element. (We're
           only expecting one message per document.) Then m_msgID will be the name
           of the message ("ACK" or whatever).

           m_curPath keeps track of where within the message we are. See notes at
           the DatumPath class definition. If m_curPath.size() != 0, then we must
           be within a message.

           At any point in the code below:

             if m_msgID.length() == 0, then m_curPath.size() == 0
             if m_curPath.size() != 0, then m_msgID.length() != 0

           The layout of m_curPath, as built by
           tryToGrowDocLocationFromElementName(), is
             [segmentID, segmentRepIdx, fieldIdx, fieldRepIdx, componentIdx, subcomponentIdx]
           Repetition indices count from 0; field / component / subcomponent
           indices are taken verbatim from the numeric suffix of the element name.
        */
        StringBuffer m_msgID = new StringBuffer();
        DatumPath m_curPath = new DatumPath();

        /* The location in the document of the last datum we dumped to m_props. */
        DatumPath m_lastDumpedPath = new DatumPath();

        /**
         * For handling repeat segments: segmentID -> next repeat idx.
         * So when we hit a segment ZYX, we'll know how many times we've
         * hit a ZYX before, and set the segmentRepIdx part of m_curPath
         * appropriately.
         */
        SortedMap<String, Integer> m_segmentId2nextRepIdx = new TreeMap<String, Integer>();

        /* m_depthWithinUselessElement and m_depthWithinUsefulElement reflect what
           m_msgMask thinks about our location in the document at any given time.

           Both should always be >= -1. Note that both can be >= 0 at the same
           time -- explained in a minute....

           If m_depthWithinUsefulElement >= 0, we are within an area of the message
           that passes m_msgMask, and we should dump whatever we find there to
           m_props. As we move around within such an element, we still update
           m_curPath appropriately. (The flag is dropped back to -1 whenever an
           element inside such an area ends, and re-established from m_msgMask
           when the next element starts.)

           If m_depthWithinUselessElement >= 0, we are however deep (in terms of
           nested elements: 0 => just within) within an element which either made
           no sense (eg. <ZZZ.1> where we were expecting a <ZYX.1>), or, more
           importantly, within an element that has no hope of having any useful
           elements within it according to m_msgMask (eg. m_msgMask says it wants
           only ZYX segment contents, and we're in an <MSH>). So we can safely
           ignore all content within, and just keep track of how deep we are
           within this useless element. We don't update m_curPath while
           m_depthWithinUselessElement >= 0: there's no point, and we could not
           extract DatumPath information from nonsensical element names anyway.

           If they are both >= 0, this means that we've found some useless stuff
           (nonsensical element names?) within a known-useful element.
        */
        int m_depthWithinUsefulElement = -1, m_depthWithinUselessElement = -1;

        /* With this we keep the text that we've found within a certain element.
           It's cleared whenever we enter a (sub) element or dump its contents. */
        StringBuffer m_chars = new StringBuffer(10);

        public HL7MessageHandler() {
            this.clear();
        }

        /** Resets all parser state. m_props and m_msgMask are not state and are left alone. */
        void clear() {
            m_startedDocument = false;
            m_msgID.setLength(0);
            m_curPath.clear();
            // will always be "less than" (according to DatumPath.numbersLessThan)
            // any sensible DatumPath:
            m_lastDumpedPath.clear().add("").add(-42).add(-42).add(-42).add(-42).add(-42);
            m_segmentId2nextRepIdx.clear();
            m_depthWithinUsefulElement = -1;
            m_depthWithinUselessElement = -1;
            m_chars.setLength(0);
        }

        /**
         * Marks the start of the (single) document.
         *
         * @throws SAXException a {@link StopParsingException} if a document was
         *         already started or m_props has not been set
         */
        @Override
        public void startDocument() throws SAXException {
            if (m_startedDocument || m_props == null) {
                clear();
                throw new StopParsingException();
            }
            m_startedDocument = true;
        }

        /**
         * Resets the handler state at the end of the document.
         *
         * @throws SAXException a {@link StopParsingException} if no document was started
         */
        @Override
        public void endDocument() throws SAXException {
            boolean wasStarted = m_startedDocument;
            clear();
            if (!wasStarted) {
                throw new StopParsingException();
            }
        }

        /**
         * Updates the document location for a newly entered element and decides,
         * by consulting m_msgMask, whether its contents are useful or useless.
         *
         * @throws SAXException a {@link StopParsingException} if no document was started
         */
        @Override
        public void startElement(String uri, String localName, String qName,
                Attributes attributes) throws SAXException {
            if (!m_startedDocument) {
                clear();
                throw new StopParsingException();
            }

            // A single unit of text data will be within a single element -- none
            // of it will be in sub-elements and there will be no sub-elements
            // fragmenting the data text. Right now we're entering a new element:
            // this means that anything in m_chars will be whitespace (likely), or
            // text left over from, say, the last field, or text that was somewhere
            // it shouldn't have been.
            // (ex. "<ZYX.9> shouldn't be here <PT.1> P </PT.1> </ZYX.9>")
            m_chars.setLength(0);

            if (m_depthWithinUselessElement >= 0) {
                ++m_depthWithinUselessElement;
                return;
            }

            int oldCurPathSize = m_curPath.size();
            if (!tryToGrowDocLocationFromElementName(m_msgID, m_curPath,
                    m_segmentId2nextRepIdx, m_lastDumpedPath, qName)) {
                // can't make sense of this element name here => useless.
                m_depthWithinUselessElement = 0;
                return;
            }

            // Nothing to decide if m_curPath did not grow (message element or
            // segment group element), or if we're already within a useful
            // element (no need to compare against m_msgMask again).
            if (m_curPath.size() <= oldCurPathSize || m_depthWithinUsefulElement != -1) {
                return;
            }

            if (curPathStartsWithAMaskElem()) {
                // we've just entered a useful element.
                m_depthWithinUsefulElement = 0;
            } else if (!aMaskElemStartsWithCurPath()) {
                // not useful itself, and cannot contain anything useful either.
                m_depthWithinUselessElement = 0;
                m_curPath.setSize(oldCurPathSize);
            }
            // else => not useful itself but might contain useful elements:
            // carry on, m_depthWithinUse{less,ful}Element still both -1.
        }

        /** Returns true if m_curPath startsWith at least one element of m_msgMask. */
        private boolean curPathStartsWithAMaskElem() {
            for (DatumPath maskElem : m_msgMask) {
                if (m_curPath.startsWith(maskElem)) {
                    return true;
                }
            }
            return false;
        }

        /** Returns true if at least one element of m_msgMask startsWith m_curPath. */
        private boolean aMaskElemStartsWithCurPath() {
            for (DatumPath maskElem : m_msgMask) {
                if (maskElem.startsWith(m_curPath)) {
                    return true;
                }
            }
            return false;
        }

        /**
         * doc location == msgID &amp; curPath together.
         * If we've encountered an element called elementName, then this tries
         * to determine what it is, based on what we already know about the
         * document.
         *
         * @param msgID                in/out: name of the message element, empty if not yet inside one
         * @param curPath              in/out: current location within the message
         * @param segmentId2nextRepIdx in/out: per segment ID, the repetition index the next occurrence gets
         * @param lastDumpedPath       in: location of the last datum that was dumped; used to
         *                             detect repetitions of a field
         * @param elementName          in: name of the element being entered
         * @return true if we can make sense of this new element name given the
         *         position we're at (represented by msgID / curPath), false if we
         *         can't (which probably means this should be a useless element).
         *         Returning true doesn't mean that we actually changed msgID or
         *         curPath; it might mean that we just passed through a segment
         *         group element OK.
         */
        protected static boolean tryToGrowDocLocationFromElementName(
                StringBuffer msgID /*in/out*/, DatumPath curPath /*in/out*/,
                Map<String, Integer> segmentId2nextRepIdx /*in/out*/, DatumPath lastDumpedPath /*in*/,
                String elementName /*in*/) {
            if (msgID.length() == 0) {
                if (curPath.size() != 0) {
                    return false; // inconsistent state: a location without a message.
                }
                // we're entering a message
                msgID.replace(0, msgID.length(), elementName);
                segmentId2nextRepIdx.clear();
                return true;
            }

            if (curPath.size() == 0) {
                // we're entering either a segment-group element
                // (eg. <ADT_A01.PROCEDURE>) or an actual segment element.
                if (!elementName.startsWith(msgID.toString() + '.')) {
                    // must be an actual segment.
                    Integer segmentRepIdx = segmentId2nextRepIdx.get(elementName);
                    if (segmentRepIdx == null) {
                        segmentRepIdx = Integer.valueOf(0);
                    }
                    curPath.add(elementName);
                    curPath.add(segmentRepIdx);
                    segmentId2nextRepIdx.put(elementName,
                            Integer.valueOf(segmentRepIdx.intValue() + 1));
                }
                return true;
            }

            if (curPath.size() == 2) {
                // we're entering a field element;
                // all fields should start with segment-ID + '.'
                if (!elementName.startsWith("" + curPath.get(0) + '.')) {
                    return false; // this isn't a field -- must be useless.
                }
                try {
                    int fieldIdxFromElementName
                        = Integer.parseInt(elementName.substring(elementName.indexOf('.') + 1));

                    // This element is a further repetition only if the last datum
                    // dumped was the same field of the SAME segment repetition.
                    // Comparing the field number alone would mis-number the first
                    // field of a segment whenever the previous segment's last
                    // dumped field happened to carry the same field number.
                    boolean repeatsLastDumpedField = (lastDumpedPath.size() >= 4)
                        && curPath.get(0).equals(lastDumpedPath.get(0))
                        && curPath.get(1).equals(lastDumpedPath.get(1))
                        && (((Integer) lastDumpedPath.get(2)).intValue() == fieldIdxFromElementName);

                    int fieldRepIdx = repeatsLastDumpedField
                        ? ((Integer) lastDumpedPath.get(3)).intValue() + 1
                        : 0;

                    curPath.add(Integer.valueOf(fieldIdxFromElementName));
                    curPath.add(Integer.valueOf(fieldRepIdx));
                    return true;
                } catch (NumberFormatException ignored) {
                    // no numeric suffix => not a field element => useless.
                    return false;
                }
            }

            if ((curPath.size() == 4) || (curPath.size() == 5)) {
                // we're entering a component or subcomponent element
                try {
                    int idxFromElementName
                        = Integer.parseInt(elementName.substring(elementName.indexOf('.') + 1));
                    curPath.add(Integer.valueOf(idxFromElementName));
                    return true;
                } catch (NumberFormatException ignored) {
                    // no numeric suffix => not a (sub)component element => useless.
                    return false;
                }
            }

            return false;
        }

        /**
         * Dumps any pending text for the element being left (if appropriate) and
         * shrinks the document location accordingly.
         *
         * @throws SAXException a {@link StopParsingException} if no document was
         *         started or the current location is inconsistent
         */
        @Override
        public void endElement(String uri, String localName, String qName)
                throws SAXException {
            if (!m_startedDocument) {
                clear();
                throw new StopParsingException();
            }

            if (m_depthWithinUselessElement >= 0) {
                --m_depthWithinUselessElement;
                return;
            }

            boolean ok = false;
            if (m_msgID.length() > 0) {
                if (m_curPath.size() == 0) {
                    // we're exiting either a message element or a
                    // segment group element.
                    if (String.valueOf(qName).equals(m_msgID.toString())) {
                        m_msgID.setLength(0); // => exiting message element
                    }
                    // else => segment group element -- do nothing.
                    ok = true;
                } else {
                    tryToDumpDataToProps();

                    switch (m_curPath.size()) {
                    case 2: // exiting a segment element
                        m_curPath.setSize(0);
                        ok = true;
                        break;
                    case 4: // exiting a field element
                        m_curPath.setSize(2);
                        ok = true;
                        break;
                    case 5: // exiting a component ...
                    case 6: // ... or a subcomponent
                        m_curPath.setSize(m_curPath.size() - 1);
                        ok = true;
                        break;
                    default:
                        break;
                    }
                }
            }

            if (m_depthWithinUsefulElement >= 0) {
                --m_depthWithinUsefulElement;
            }

            if (!ok) {
                clear();
                throw new StopParsingException();
            }
        }

        /**
         * Try to dump whatever we've got in m_chars to m_props,
         * with a key of m_curPath.toString().
         */
        protected void tryToDumpDataToProps() {
            if ((m_curPath.size() < 2) || (m_depthWithinUselessElement != -1)) {
                return;
            }

            /* m_curPath.toString() will be the property key whose value will be
               m_chars.

               This is (part of) what m_lastDumpedPath is for. With, for example,
               "<ZYX.9> <PT.1>P</PT.1> </ZYX.9>" we dump "P" when we exit the PT.1
               element. Then m_curPath shrinks back to the ZYX.9 field, we pick up
               the whitespace between </PT.1> and </ZYX.9>, and when exiting the
               ZYX.9 element we might write that whitespace to m_props under the
               key of the field itself. Since DatumPath.toString() pads a short
               path with defaults, that key can be the same as the key for the
               "P" ... clobbering "P" in m_props with whitespace.

               But since we know that HL7 fields / components / etc are always in
               order (numerically), we can count on m_lastDumpedPath and use
               DatumPath.numbersLessThan to avoid the clobbering.
            */
            boolean sameSegmentIdAsLastDump = m_lastDumpedPath.get(0).equals(m_curPath.get(0));
            if (sameSegmentIdAsLastDump && !m_lastDumpedPath.numbersLessThan(m_curPath)) {
                return;
            }

            if (m_depthWithinUsefulElement < 0) {
                return;
            }

            String key = m_curPath.toString();
            // TODO: remove! or assert
            if (m_props.containsKey(key)) {
                System.err.println("ALAAAARM: CLOBBERING PROPERTY in " + getClass());
            }

            m_props.setProperty(key, m_chars.toString());
            m_lastDumpedPath.copy(m_curPath);
            m_chars.setLength(0);
        }

        /**
         * Collects element text. Only text found at field level or deeper
         * (m_curPath.size() &gt;= 4) within a message is kept.
         */
        @Override
        public void characters(char[] chars, int start, int length) {
            // note that a contiguous run of characters in the document
            // might get reported to us in several chunks.
            // (In the order that the text appears in the document,
            // non-overlapping and with no gaps between chunks.)
            // An entity like &amp; will reach us as an actual & character.
            if ((m_msgID.length() > 0) && (m_curPath.size() >= 4)) {
                m_chars.append(chars, start, length);
            }
        }

        /**
         * SAX callback for ignorable whitespace. It's unclear which whitespace is
         * considered ignorable for us, so it is treated like any other text.
         */
        @Override
        public void ignorableWhitespace(char[] chars, int start, int length) {
            characters(chars, start, length);
        }

        /**
         * Misspelled variant of {@link #ignorableWhitespace(char[], int, int)}.
         * Because of the spelling it never overrode the SAX callback and is not
         * invoked by the parser; kept only so existing callers still compile.
         *
         * @deprecated use {@link #ignorableWhitespace(char[], int, int)}
         */
        @Deprecated
        public void ignoreableWhitespace(char[] chars, int start, int length) {
            characters(chars, start, length);
        }

        /** Reports a recoverable parse error on System.err and carries on. */
        @Override
        public void error(SAXParseException e) {
            // TODO: remove.
            System.err.println("Error in " + getClass() + ": " + e);
        }

        /** Rethrows fatal parse errors so that parsing stops. */
        @Override
        public void fatalError(SAXParseException e) throws SAXException {
            throw e;
        }
    }

    /**
     * Parse message according to our HL7 XML handler, and dump the data found
     * to props.
     *
     * <p>Returns true if we parsed ok, which means well-formed XML, and that's
     * about it. We just barely check against HL7 structure, and ignore any
     * elements / text that is unexpected (that is, impossible in any HL7
     * message: independent of any message / segment definitions).
     *
     * <p>"message" should be an XML document with one top-level element -- that
     * being the message (&lt;ACK&gt; or whatever). We're only expecting one
     * message to be in "message".
     *
     * <p>props can be null if you don't want the data (we still parse). The
     * message data found in message (that passes msgMask) will be added to props
     * as key / value pairs with the key a toString() of the appropriate DatumPath
     * for the location where the data is found (i.e. in the ZYX[a]-b[c]-d-e
     * style; see DatumPath.toString() for the exact format), and the value the
     * corresponding text. So, after calling parseMessage successfully, if you
     * wanted to retrieve the message data from props you might call something
     * like
     * <code>props.getProperty((new DatumPath()).add("MSH").add(1).toString())</code>
     * and that would return a String with "|", probably.
     *
     * <p>Note that this package facilitates the extraction of message data in a
     * way independent of message version (i.e. components and whatever getting
     * added), because DatumPath.toString() fills in defaults for the parts of a
     * path that are not given:
     *
     * <p>With a message of
     * "&lt;FOO&gt;&lt;ZYX&gt;&lt;ZYX.42&gt;fieldy-field-field&lt;/ZYX.42&gt;&lt;/ZYX&gt;&lt;/FOO&gt;",
     * "ZYX[0]-42[0]-1-1" will be the key that ends up in props (see notes at
     * DatumPath.toString()).
     *
     * <p>So if you, coding for a future version of the FOO message but receiving
     * old-version message data, tried
     * <code>props.getProperty((new DatumPath()).add("ZYX").add(0).add(42).add(0).add(1).toString())</code>
     * with the message above (that is, trying to extract a repetition and
     * component that aren't there), you would get "ZYX[0]-42[0]-1-1" mapping to
     * "fieldy-field-field" in the resulting props.
     *
     * <p>If the message was
     * "&lt;FOO&gt;&lt;ZYX&gt;&lt;ZYX.42&gt;&lt;ARG.1&gt;component data&lt;/ARG.1&gt;&lt;/ZYX.42&gt;&lt;/ZYX&gt;&lt;/FOO&gt;"
     * and you, coding for an old version of this FOO message but receiving
     * new-version FOO message data, tried
     * <code>props.getProperty((new DatumPath()).add("ZYX").add(0).add(42).toString())</code>
     * you would get "ZYX[0]-42[0]-1-1" mapping to "component data" in the
     * resulting props.
     *
     * <p>msgMask lets you specify which parts of the message you want dumped to
     * props. Passing in null gets you everything. Otherwise a particular part of
     * the message will be dumped to props only if its location, as represented
     * by a DatumPath, startsWith (as in DatumPath.startsWith()) at least one
     * element of msgMask. So if one element of msgMask was a
     * <code>(new DatumPath()).add("ZYX")</code>, then everything in all ZYX
     * segments would get dumped to props; adding a segment repetition index to
     * that path would restrict it to that one repetition of the segment (if
     * there is one). Note that a DatumPath of size() == 0 in msgMask will get you
     * everything, no matter what the other elements of msgMask are, because all
     * DatumPaths startsWith the zero-length DatumPath.
     *
     * <p>Segment group elements (eg. ADT_A01.PROCEDURE) are handled fine, but
     * they aren't addressed in msgMask or in the output in props -- basically any
     * element tag at the level immediately inside the message element, and having
     * a name that starts with the message element name + '.', is ignored (meaning
     * its contents are dealt with the same as if the start and end tags just
     * weren't there).
     *
     * <p>External entities and external DTDs are not resolved, where the
     * underlying parser allows that to be switched off, since message text may
     * come from an untrusted source.
     *
     * @param props   receives the data found; may be null
     * @param message the XML-encoded message text
     * @param msgMask which parts of the message to dump; null means everything
     * @return true if the message was parsed without error
     * @throws HL7Exception if the parser cannot be set up or the message cannot be parsed
     */
    public static boolean parseMessage(Properties props, String message,
            Collection<DatumPath> msgMask) throws HL7Exception {
        try {
            SAXParserFactory factory = SAXParserFactory.newInstance();
            // Best-effort XXE hardening.
            trySetFeature(factory, "http://xml.org/sax/features/external-general-entities", false);
            trySetFeature(factory, "http://xml.org/sax/features/external-parameter-entities", false);
            trySetFeature(factory, "http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
            SAXParser parser = factory.newSAXParser();

            InputSource inSrc = new InputSource(new java.io.StringReader(message));

            HL7MessageHandler handler = new HL7MessageHandler();
            // the handler is expecting a props.
            handler.m_props = (props != null) ? props : new Properties();

            if (msgMask != null) {
                handler.m_msgMask = msgMask;
            } else {
                // the zero-length DatumPath matches everything.
                handler.m_msgMask = new ArrayList<DatumPath>();
                handler.m_msgMask.add(new DatumPath());
            }

            parser.parse(inSrc, handler);
            return true;
        } catch (ParserConfigurationException e) {
            throw new HL7Exception(e);
        } catch (IOException e) {
            throw new HL7Exception(e);
        } catch (SAXException e) {
            // includes StopParsingException
            throw new HL7Exception(e);
        }
    }

    /**
     * Sets a parser feature if the underlying implementation supports it.
     * Unsupported or unrecognized features are skipped on purpose, so that
     * hardening never prevents parsing on a parser that lacks the feature.
     */
    private static void trySetFeature(SAXParserFactory factory, String feature, boolean value) {
        try {
            factory.setFeature(feature, value);
        } catch (ParserConfigurationException ignored) {
            // feature not supported by this parser: nothing more we can do.
        } catch (SAXException ignored) {
            // feature not recognized / not supported by this parser.
        }
    }

    /**
     * Small manual test driver: parses args[0] as an XML-encoded message with a
     * fixed mask and lists the resulting properties on System.err.
     */
    public static void main(String args[]) {
        if (args.length >= 1) {
            Properties props = new Properties();
            List<DatumPath> msgMask = new ArrayList<DatumPath>();
            msgMask.add(new DatumPath().add("MSH").add(0).add(9));
            try {
                boolean parseret = XML.parseMessage(props, args[0], msgMask);
                System.err.println("parseMessage returned " + parseret);
            } catch (HL7Exception e) {
                e.printStackTrace();
            }
            props.list(System.err);
        }
    }
}