package com.atlassian.confluence.extra.flyingpdf.html;

import com.atlassian.confluence.importexport.ImportExportException;
import org.apache.xerces.parsers.DOMParser;
import org.apache.xerces.xni.parser.XMLDocumentFilter;
import org.cyberneko.html.HTMLConfiguration;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;

public class HtmlConverterUtils {

    public static final String STYLECOLLECTOR_KEY = "http://atlassian.com/html/properties/stylecollector";
    public static final String LINKFIXER_KEY = "http://atlassian.com/html/properties/linkfixer";

    // NekoHTML setting identifiers used when configuring the parser.
    private static final String NEKO_FILTERS = "http://cyberneko.org/html/properties/filters";
    private static final String NEKO_OVERRIDE_DOCTYPE = "http://cyberneko.org/html/features/override-doctype";
    private static final String NEKO_DOCTYPE_PUBID = "http://cyberneko.org/html/properties/doctype/pubid";
    private static final String NEKO_DOCTYPE_SYSID = "http://cyberneko.org/html/properties/doctype/sysid";
    private static final String NEKO_ELEMENT_NAMES = "http://cyberneko.org/html/properties/names/elems";
    private static final String NEKO_DEFAULT_ENCODING = "http://cyberneko.org/html/properties/default-encoding";

    /**
     * Builds a parser for turning a complete HTML document into an XHTML document.
     * <p>
     * The underlying {@link DOMParser} is backed by an {@link HTMLConfiguration} and is set up to:
     * <ul>
     * <li>run a {@link ConfluenceHtmlToXmlFilter} over the document,</li>
     * <li>override the doctype with the XHTML 1.0 Transitional public and system identifiers,</li>
     * <li>use the "lower" setting for element names and "UTF-8" as the default encoding,</li>
     * <li>carry a new, empty {@link StringBuffer} under {@link #STYLECOLLECTOR_KEY} and the supplied
     * link fixer under {@link #LINKFIXER_KEY} (presumably for the filter to read; confirm against
     * {@code ConfluenceHtmlToXmlFilter}).</li>
     * </ul>
     *
     * @param linkFixer the link fixer stored on the parser under {@link #LINKFIXER_KEY}
     * @return a new {@link HtmlToDomParser} wrapping the configured parser
     * @throws ImportExportException if the parser does not recognise or support one of the settings
     */
    public static HtmlToDomParser getHtmlToXhtmlParser(LinkFixer linkFixer) throws ImportExportException {
        // The two Atlassian-specific keys must be declared as recognised before they can be set.
        String[] customPropertyKeys = {STYLECOLLECTOR_KEY, LINKFIXER_KEY};
        HTMLConfiguration nekoConfiguration = new HTMLConfiguration();
        nekoConfiguration.addRecognizedProperties(customPropertyKeys);

        DOMParser domParser = new DOMParser(nekoConfiguration);
        try {
            applySettings(domParser, linkFixer);
        } catch (SAXNotRecognizedException notRecognized) {
            throw new ImportExportException(notRecognized);
        } catch (SAXNotSupportedException notSupported) {
            throw new ImportExportException(notSupported);
        }
        return new HtmlToDomParser(domParser);
    }

    /**
     * Applies the filter, doctype, naming, encoding and custom property settings to the given parser.
     */
    private static void applySettings(DOMParser domParser, LinkFixer linkFixer)
            throws SAXNotRecognizedException, SAXNotSupportedException {
        XMLDocumentFilter[] filterChain = {new ConfluenceHtmlToXmlFilter()};
        domParser.setProperty(NEKO_FILTERS, filterChain);

        domParser.setFeature(NEKO_OVERRIDE_DOCTYPE, true);
        domParser.setProperty(NEKO_DOCTYPE_PUBID, "-//W3C//DTD XHTML 1.0 Transitional//EN");
        domParser.setProperty(NEKO_DOCTYPE_SYSID, "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd");

        domParser.setProperty(NEKO_ELEMENT_NAMES, "lower");
        domParser.setProperty(NEKO_DEFAULT_ENCODING, "UTF-8");

        // Each parser gets its own, initially empty, style buffer.
        domParser.setProperty(STYLECOLLECTOR_KEY, new StringBuffer());
        domParser.setProperty(LINKFIXER_KEY, linkFixer);
    }
}
