/*
 * Decompiled with CFR 0.152.
 */
package gate.corpora;

import gate.Document;
import gate.Resource;
import gate.TextualDocument;
import gate.corpora.DocumentImpl;
import gate.corpora.MimeType;
import gate.corpora.RepositioningInfo;
import gate.corpora.TextualDocumentFormat;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.AutoInstance;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.event.StatusListener;
import gate.html.NekoHtmlDocumentHandler;
import gate.util.DocumentFormatException;
import gate.util.Out;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.net.URLConnection;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import org.apache.xerces.xni.XMLDocumentHandler;
import org.apache.xerces.xni.parser.XMLErrorHandler;
import org.apache.xerces.xni.parser.XMLInputSource;
import org.cyberneko.html.HTMLConfiguration;

/**
 * Document format that unpacks HTML markup by running the NekoHTML parser
 * over the document and forwarding the parse events to a
 * {@link NekoHtmlDocumentHandler}.
 *
 * <p>{@link #init()} registers this instance as the handler for the
 * {@code text/html} MIME type, the {@code application/xhtml+xml} MIME string,
 * the {@code html} / {@code htm} suffixes and the {@code <html} magic string.
 */
@CreoleResource(name="GATE HTML Document Format", isPrivate=true, autoinstances={@AutoInstance(hidden=true)})
public class NekoHtmlDocumentFormat
extends TextualDocumentFormat {
    private static final long serialVersionUID = -3163147687966075651L;
    private static final boolean DEBUG = false;

    /** Matches the start of every line; used to compute per-line character offsets. */
    private static final Pattern AFTER_NEWLINE_PATTERN = Pattern.compile("^", Pattern.MULTILINE);

    /** Tags whose text content is passed to the handler as ignorable; may be {@code null}. */
    private Set<String> ignorableTags = null;

    /**
     * Sets the HTML tags whose text content should be ignored.
     *
     * @param newTags the tag names; stored as-is (no defensive copy)
     */
    @CreoleParameter(comment="HTML tags whose text content should be ignored", defaultValue="script;style")
    public void setIgnorableTags(Set<String> newTags) {
        this.ignorableTags = newTags;
    }

    /**
     * Returns the HTML tags whose text content should be ignored.
     *
     * @return the set previously given to {@link #setIgnorableTags(Set)}, or {@code null}
     */
    public Set<String> getIgnorableTags() {
        return this.ignorableTags;
    }

    /**
     * Reports that this format supports repositioning information.
     *
     * @return always {@link Boolean#TRUE}
     */
    @Override
    public Boolean supportsRepositioning() {
        return Boolean.TRUE;
    }

    /**
     * Unpacks the markup of {@code doc} without collecting repositioning or
     * ampersand-coding information.
     *
     * @param doc the document to parse
     * @throws DocumentFormatException see
     *         {@link #unpackMarkup(Document, RepositioningInfo, RepositioningInfo)}
     */
    @Override
    public void unpackMarkup(Document doc) throws DocumentFormatException {
        this.unpackMarkup(doc, null, null);
    }

    /**
     * Parses {@code doc} as HTML and lets a {@link NekoHtmlDocumentHandler}
     * process the resulting parse events.
     *
     * <p>The input is chosen as follows: if the document has content but no
     * valid URL, its content string is parsed; otherwise, for a
     * {@link TextualDocument}, the source URL is opened and decoded with the
     * document's encoding (transparently un-gzipping when the connection
     * reports {@code gzip} content encoding); otherwise the parser is handed
     * the source URL and opens it itself.
     *
     * <p>Failures other than I/O errors set the {@code parsingError} feature
     * on the document. They are rethrown only when the document feature
     * {@code throwExceptionOnFormatError} is {@code TRUE}; otherwise a warning
     * and stack trace are printed via {@link Out} and the method returns
     * normally.
     *
     * @param doc the document to parse; must not be {@code null} and must have
     *        a source URL or content
     * @param repInfo repositioning info passed to the handler; may be {@code null}
     * @param ampCodingInfo ampersand-coding info passed to the handler; may be {@code null}
     * @throws DocumentFormatException if {@code doc} is {@code null} or has
     *         neither URL nor content, if an I/O error occurs, or if parsing
     *         fails and {@code throwExceptionOnFormatError} is set
     */
    @Override
    public void unpackMarkup(Document doc, RepositioningInfo repInfo, RepositioningInfo ampCodingInfo) throws DocumentFormatException {
        if (doc == null || doc.getSourceUrl() == null && doc.getContent() == null) {
            throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
        }
        // Relays the handler's status messages to this format's own listeners.
        StatusListener statusListener = new StatusListener(){

            @Override
            public void statusChanged(String text) {
                NekoHtmlDocumentFormat.this.fireStatusChanged(text);
            }
        };
        boolean docHasContentButNoValidURL = NekoHtmlDocumentFormat.hasContentButNoValidUrl(doc);
        NekoHtmlDocumentHandler handler = null;
        // Stream opened by this method (TextualDocument branch only); closed in finally.
        InputStream urlStream = null;
        try {
            XMLInputSource is;
            HTMLConfiguration parser = new HTMLConfiguration();
            parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
            parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
            parser.setFeature("http://cyberneko.org/html/features/augmentations", true);
            handler = new NekoHtmlDocumentHandler(doc, null, this.ignorableTags);
            handler.addStatusListener(statusListener);
            handler.setRepositioningInfo(repInfo);
            handler.setAmpCodingInfo(ampCodingInfo);
            // NOTE(review): the guard above admits a document with a URL but null
            // content; in that case this dereference throws and is handled by the
            // generic catch below (document flagged with "parsingError").
            int[] lineOffsets = this.buildLineOffsets(doc.getContent().toString());
            handler.setLineOffsets(lineOffsets);
            parser.setDocumentHandler(handler);
            parser.setErrorHandler(handler);
            if (docHasContentButNoValidURL) {
                is = new XMLInputSource(null, null, null, new StringReader(doc.getContent().toString()), null);
            } else if (doc instanceof TextualDocument) {
                String docEncoding = ((TextualDocument)doc).getEncoding();
                String sourceUrl = doc.getSourceUrl().toString();
                URLConnection conn = doc.getSourceUrl().openConnection();
                urlStream = conn.getInputStream();
                if ("gzip".equals(conn.getContentEncoding())) {
                    // If the wrapper constructor throws, urlStream still refers to
                    // the raw stream and is closed in finally.
                    urlStream = new GZIPInputStream(urlStream);
                }
                Reader docReader = new InputStreamReader(urlStream, docEncoding);
                is = new XMLInputSource(null, sourceUrl, sourceUrl, docReader, docEncoding);
                // The encoding is dictated by the document, so charset hints in the
                // markup must not override it.
                parser.setFeature("http://cyberneko.org/html/features/scanner/ignore-specified-charset", true);
            } else {
                String sourceUrl = doc.getSourceUrl().toString();
                is = new XMLInputSource(null, sourceUrl, sourceUrl);
            }
            parser.parse(is);
            ((DocumentImpl)doc).setNextAnnotationId(handler.getCustomObjectsId());
        }
        catch (IOException e) {
            // Null-safe concatenation: the source URL may be null when the document
            // was parsed from its content, and an NPE here would hide the real error.
            throw new DocumentFormatException("I/O exception for " + doc.getSourceUrl(), e);
        }
        catch (Exception e) {
            doc.getFeatures().put("parsingError", Boolean.TRUE);
            Boolean bThrow = (Boolean)doc.getFeatures().get("throwExceptionOnFormatError");
            if (bThrow != null && bThrow.booleanValue()) {
                throw new DocumentFormatException(e);
            }
            Out.println("Warning: Document remains unparsed. \n\n  Stack Dump: ");
            e.printStackTrace(Out.getPrintWriter());
        }
        finally {
            if (handler != null) {
                handler.removeStatusListener(statusListener);
            }
            if (urlStream != null) {
                try {
                    urlStream.close();
                }
                catch (IOException ignored) {
                    // Best-effort cleanup; the parse outcome has already been decided.
                }
            }
        }
    }

    /**
     * Computes the character offset at which each line of {@code docContent}
     * starts, i.e. the start index of every match of {@code ^} in multi-line mode.
     *
     * @param docContent the document text
     * @return the line-start offsets in ascending order, one per match
     */
    private int[] buildLineOffsets(String docContent) {
        Matcher m = AFTER_NEWLINE_PATTERN.matcher(docContent);
        // First pass: count the matches so the array can be sized exactly.
        int numMatches = 0;
        while (m.find()) {
            ++numMatches;
        }
        // Second pass: record where each match starts.
        int[] lineOffsets = new int[numMatches];
        m.reset();
        for (int i = 0; i < lineOffsets.length; ++i) {
            m.find();
            lineOffsets[i] = m.start();
        }
        return lineOffsets;
    }

    /**
     * Registers this format in the MIME-type, suffix and magic-string lookup
     * maps and sets {@code text/html} as its MIME type.
     *
     * @return this instance
     * @throws ResourceInstantiationException declared by the overridden method;
     *         not thrown by the code in this method
     */
    @Override
    public Resource init() throws ResourceInstantiationException {
        MimeType mime = new MimeType("text", "html");
        String mimeKey = mime.getType() + "/" + mime.getSubtype();
        mimeString2ClassHandlerMap.put(mimeKey, this);
        mimeString2mimeTypeMap.put(mimeKey, mime);
        mimeString2mimeTypeMap.put("application/xhtml+xml", mime);
        suffixes2mimeTypeMap.put("html", mime);
        suffixes2mimeTypeMap.put("htm", mime);
        magic2mimeTypeMap.put("<html", mime);
        this.setMimeType(mime);
        return this;
    }
}

