/*
 * Decompiled with CFR 0.152.
 */
package edu.uci.ics.crawler4j.parser;

import edu.uci.ics.crawler4j.crawler.Configurable;
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.exceptions.ParseException;
import edu.uci.ics.crawler4j.parser.BinaryParseData;
import edu.uci.ics.crawler4j.parser.ExtractedUrlAnchorPair;
import edu.uci.ics.crawler4j.parser.HtmlContentHandler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.parser.NotAllowedContentException;
import edu.uci.ics.crawler4j.parser.TextParseData;
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
import edu.uci.ics.crawler4j.url.WebURL;
import edu.uci.ics.crawler4j.util.Net;
import edu.uci.ics.crawler4j.util.Util;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.HashSet;
import java.util.Locale;
import org.apache.tika.language.LanguageIdentifier;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.HtmlParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;

public class Parser
extends Configurable {
    protected static final Logger logger = LoggerFactory.getLogger(Parser.class);
    private HtmlParser htmlParser = new HtmlParser();
    private ParseContext parseContext = new ParseContext();

    public Parser(CrawlConfig config) {
        super(config);
    }

    /*
     * Enabled force condition propagation
     * Lifted jumps to return sites
     */
    public void parse(Page page, String contextURL) throws NotAllowedContentException, ParseException {
        if (Util.hasBinaryContent(page.getContentType())) {
            BinaryParseData parseData = new BinaryParseData();
            if (!this.config.isIncludeBinaryContentInCrawling()) throw new NotAllowedContentException();
            parseData.setBinaryContent(page.getContentData());
            page.setParseData(parseData);
            if (parseData.getHtml() == null) {
                throw new ParseException();
            }
            parseData.setOutgoingUrls(Net.extractUrls(parseData.getHtml()));
            return;
        }
        if (Util.hasPlainTextContent(page.getContentType())) {
            try {
                TextParseData parseData = new TextParseData();
                if (page.getContentCharset() == null) {
                    parseData.setTextContent(new String(page.getContentData()));
                } else {
                    parseData.setTextContent(new String(page.getContentData(), page.getContentCharset()));
                }
                parseData.setOutgoingUrls(Net.extractUrls(parseData.getTextContent()));
                page.setParseData(parseData);
                return;
            }
            catch (Exception e) {
                logger.error("{}, while parsing: {}", (Object)e.getMessage(), (Object)page.getWebURL().getURL());
                throw new ParseException();
            }
        }
        Metadata metadata = new Metadata();
        HtmlContentHandler contentHandler = new HtmlContentHandler();
        try (ByteArrayInputStream inputStream = new ByteArrayInputStream(page.getContentData());){
            this.htmlParser.parse((InputStream)inputStream, (ContentHandler)contentHandler, metadata, this.parseContext);
        }
        catch (Exception e) {
            logger.error("{}, while parsing: {}", (Object)e.getMessage(), (Object)page.getWebURL().getURL());
            throw new ParseException();
        }
        if (page.getContentCharset() == null) {
            page.setContentCharset(metadata.get("Content-Encoding"));
        }
        HtmlParseData parseData = new HtmlParseData();
        parseData.setText(contentHandler.getBodyText().trim());
        parseData.setTitle(metadata.get(DublinCore.TITLE));
        parseData.setMetaTags(contentHandler.getMetaTags());
        LanguageIdentifier languageIdentifier = new LanguageIdentifier(parseData.getText());
        page.setLanguage(languageIdentifier.getLanguage());
        HashSet<WebURL> outgoingUrls = new HashSet<WebURL>();
        String baseURL = contentHandler.getBaseUrl();
        if (baseURL != null) {
            contextURL = baseURL;
        }
        int urlCount = 0;
        for (ExtractedUrlAnchorPair urlAnchorPair : contentHandler.getOutgoingUrls()) {
            String url;
            String hrefLoweredCase;
            String href = urlAnchorPair.getHref();
            if (href == null || href.trim().length() == 0 || (hrefLoweredCase = href.trim().toLowerCase()).contains("javascript:") || hrefLoweredCase.contains("mailto:") || hrefLoweredCase.contains("@") || (url = URLCanonicalizer.getCanonicalURL(href, contextURL)) == null) continue;
            WebURL webURL = new WebURL();
            webURL.setURL(url);
            webURL.setTag(urlAnchorPair.getTag());
            webURL.setAnchor(urlAnchorPair.getAnchor());
            outgoingUrls.add(webURL);
            if (++urlCount <= this.config.getMaxOutgoingLinksToFollow()) continue;
            break;
        }
        parseData.setOutgoingUrls(outgoingUrls);
        try {
            if (page.getContentCharset() == null) {
                parseData.setHtml(new String(page.getContentData()));
            } else {
                parseData.setHtml(new String(page.getContentData(), page.getContentCharset()));
            }
            page.setParseData(parseData);
            return;
        }
        catch (UnsupportedEncodingException e) {
            logger.error("error parsing the html: " + page.getWebURL().getURL(), (Throwable)e);
            throw new ParseException();
        }
    }
}

