/*
 * Decompiled with CFR 0.152.
 */
package gate.creole.splitter;

import gate.AnnotationSet;
import gate.Factory;
import gate.FeatureMap;
import gate.Resource;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.creole.ResourceReference;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;
import gate.util.BomStrippingInputStreamReader;
import gate.util.InvalidOffsetException;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.MatchResult;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A sentence splitter driven by three lists of regular expressions.
 *
 * <p>Each list is read from a text file and compiled into a single alternation
 * pattern (see {@link #compilePattern(URL, String)}):
 * <ul>
 *   <li><b>internal splits</b> - the matched text belongs to the sentence it
 *       terminates (the {@code Sentence} annotation ends where the match ends);</li>
 *   <li><b>external splits</b> - the matched text is not part of any sentence
 *       (the {@code Sentence} annotation ends before the match starts);</li>
 *   <li><b>non splits</b> - regions of text in which any overlapping split match
 *       is vetoed.</li>
 * </ul>
 *
 * <p>For every split that is not vetoed, {@link #execute()} adds a {@code Split}
 * annotation (with a {@code kind} feature of {@code "internal"} or
 * {@code "external"}) and, when there is text between the previous sentence end
 * and the split, a {@code Sentence} annotation.
 */
@CreoleResource(name="RegEx Sentence Splitter", icon="sentence-splitter", comment="A sentence splitter based on regular expressions.", helpURL="http://gate.ac.uk/userguide/sec:annie:regex-splitter")
public class RegexSentenceSplitter extends AbstractLanguageAnalyser {
    /** Parameter name constant: {@code "document"}. */
    public static final String SPLIT_DOCUMENT_PARAMETER_NAME = "document";
    /** Parameter name constant: {@code "inputASName"}. */
    public static final String SPLIT_INPUT_AS_PARAMETER_NAME = "inputASName";
    /** Parameter name constant: {@code "outputASName"}. */
    public static final String SPLIT_OUTPUT_AS_PARAMETER_NAME = "outputASName";
    /** Parameter name constant: {@code "encoding"}. */
    public static final String SPLIT_ENCODING_PARAMETER_NAME = "encoding";
    /** Parameter name constant: {@code "splitListURL"}. */
    public static final String SPLIT_SPLIT_LIST_PARAMETER_NAME = "splitListURL";
    /** Parameter name constant: {@code "nonSplitListURL"}. */
    public static final String SPLIT_NON_SPLIT_LIST_PARAMETER_NAME = "nonSplitListURL";
    private static final long serialVersionUID = 1L;

    /**
     * Regex that can never match: a negative lookahead for the empty string.
     * Used when a pattern list contains no patterns, because the empty regex
     * would otherwise match at every offset of the document.
     */
    private static final String MATCH_NOTHING_REGEX = "(?!)";

    /** Name of the output annotation set; {@code null} or blank selects the default set. */
    protected String outputASName;
    /** Character encoding used to read the three pattern list files. */
    protected String encoding;
    /** Location of the internal split pattern list. */
    protected ResourceReference internalSplitListURL;
    /** Location of the external split pattern list. */
    protected ResourceReference externalSplitListURL;
    /** Location of the non-split (veto) pattern list. */
    protected ResourceReference nonSplitListURL;
    /** Compiled alternation of all internal split patterns; set by {@link #init()}. */
    protected Pattern internalSplitsPattern;
    /** Compiled alternation of all external split patterns; set by {@link #init()}. */
    protected Pattern externalSplitsPattern;
    /** Compiled alternation of all non-split patterns; set by {@link #init()}. */
    protected Pattern nonSplitsPattern;

    /**
     * Reads a pattern list and compiles it into one alternation pattern.
     *
     * <p>Every line is trimmed; empty lines and lines starting with {@code //}
     * are skipped. Each remaining line is wrapped in a non-capturing group and
     * the groups are joined with {@code |}. If no line remains, a pattern that
     * never matches is returned (the empty regex would match everywhere).
     *
     * @param paternsListUrl location of the pattern list
     * @param encoding character encoding of the pattern list
     * @return the compiled alternation, or a never-matching pattern for an empty list
     * @throws UnsupportedEncodingException declared for the reader construction
     * @throws IOException if the list cannot be opened or read
     */
    protected Pattern compilePattern(URL paternsListUrl, String encoding) throws UnsupportedEncodingException, IOException {
        StringBuilder alternation = new StringBuilder();
        // The stream is its own resource so that it is closed even when
        // constructing the reader on top of it fails.
        try (java.io.InputStream stream = paternsListUrl.openStream();
                BomStrippingInputStreamReader reader = new BomStrippingInputStreamReader(stream, encoding)) {
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                String entry = line.trim();
                if (entry.isEmpty() || entry.startsWith("//")) {
                    continue;
                }
                if (alternation.length() > 0) {
                    alternation.append('|');
                }
                alternation.append("(?:").append(entry).append(')');
            }
        }
        if (alternation.length() == 0) {
            return Pattern.compile(MATCH_NOTHING_REGEX);
        }
        return Pattern.compile(alternation.toString());
    }

    /**
     * Adds {@code Split} and {@code Sentence} annotations to the output
     * annotation set of the current document.
     *
     * <p>Internal and external split matches are consumed in order of their
     * start offset; each one is dropped if it overlaps a non-split region.
     * A document whose text is empty or whitespace-only is left untouched.
     *
     * @throws ExecutionException if an annotation cannot be added because of
     *         invalid offsets, or if the internal matching state is inconsistent
     */
    public void execute() throws ExecutionException {
        interrupted = false;
        int lastProgress = 0;
        fireProgressChanged(lastProgress);

        AnnotationSet outputAS = (outputASName == null || outputASName.trim().length() == 0)
                ? document.getAnnotations()
                : document.getAnnotations(outputASName);

        String docText = document.getContent().toString();
        if (docText.trim().length() < 1) {
            // Nothing to split.
            return;
        }

        Matcher internalSplitMatcher = internalSplitsPattern.matcher(docText);
        Matcher externalSplitMatcher = externalSplitsPattern.matcher(docText);

        // Collect every veto region up front as [start, end) pairs, in document order.
        List<int[]> nonSplits = new LinkedList<>();
        Matcher nonSplitMatcher = nonSplitsPattern.matcher(docText);
        while (nonSplitMatcher.find()) {
            nonSplits.add(new int[]{nonSplitMatcher.start(), nonSplitMatcher.end()});
        }

        // Holds at most one pending match per matcher; the earliest is handled first.
        List<MatchResult> pendingMatches = new ArrayList<>();
        MatchResult internalMatchResult = nextMatch(internalSplitMatcher);
        if (internalMatchResult != null) {
            pendingMatches.add(internalMatchResult);
        }
        MatchResult externalMatchResult = nextMatch(externalSplitMatcher);
        if (externalMatchResult != null) {
            pendingMatches.add(externalMatchResult);
        }

        MatchResultComparator comparator = new MatchResultComparator();
        int lastSentenceEnd = 0;
        while (!pendingMatches.isEmpty()) {
            // Stable sort: matches with equal start keep their insertion order.
            Collections.sort(pendingMatches, comparator);
            MatchResult nextMatch = pendingMatches.remove(0);

            // Identity comparison tells which matcher produced the match.
            if (nextMatch == internalMatchResult) {
                if (!veto(nextMatch, nonSplits)) {
                    lastSentenceEnd = annotateInternalSplit(outputAS, docText, nextMatch, lastSentenceEnd);
                }
                internalMatchResult = nextMatch(internalSplitMatcher);
                if (internalMatchResult != null) {
                    pendingMatches.add(internalMatchResult);
                }
            } else if (nextMatch == externalMatchResult) {
                if (!veto(nextMatch, nonSplits)) {
                    lastSentenceEnd = annotateExternalSplit(outputAS, docText, nextMatch, lastSentenceEnd);
                }
                externalMatchResult = nextMatch(externalSplitMatcher);
                if (externalMatchResult != null) {
                    pendingMatches.add(externalMatchResult);
                }
            } else {
                throw new ExecutionException("Invalid state - cannot identify match!");
            }

            // Computed in long: 100 * offset overflows int for very large documents.
            int newProgress = (int) (100L * lastSentenceEnd / docText.length());
            if (newProgress - lastProgress > 20) {
                lastProgress = newProgress;
                fireProgressChanged(lastProgress);
            }
        }
        fireProcessFinished();
    }

    /** Returns the next match of {@code matcher} as a snapshot, or {@code null} if there is none. */
    private static MatchResult nextMatch(Matcher matcher) {
        return matcher.find() ? matcher.toMatchResult() : null;
    }

    /**
     * Annotates an internal split and the sentence it terminates; the sentence
     * includes the split text. Returns the new "last sentence end" offset.
     */
    private int annotateInternalSplit(AnnotationSet outputAS, String docText, MatchResult split, int lastSentenceEnd) throws ExecutionException {
        try {
            FeatureMap features = Factory.newFeatureMap();
            features.put("kind", "internal");
            outputAS.add(Long.valueOf(split.start()), Long.valueOf(split.end()), "Split", features);

            int sentenceEnd = split.end();
            // Skip leading whitespace after the previous sentence.
            int sentenceStart = lastSentenceEnd;
            while (sentenceStart < sentenceEnd && Character.isWhitespace(Character.codePointAt(docText, sentenceStart))) {
                ++sentenceStart;
            }
            // Only create a sentence if there is text before the split itself.
            if (sentenceStart < split.start()) {
                outputAS.add(Long.valueOf(sentenceStart), Long.valueOf(sentenceEnd), "Sentence", Factory.newFeatureMap());
            }
            return sentenceEnd;
        }
        catch (InvalidOffsetException e) {
            throw new ExecutionException(e);
        }
    }

    /**
     * Annotates an external split and the sentence preceding it; the sentence
     * excludes the split text and surrounding space characters. Returns the new
     * "last sentence end" offset, which is the end of the split.
     */
    private int annotateExternalSplit(AnnotationSet outputAS, String docText, MatchResult split, int lastSentenceEnd) throws ExecutionException {
        try {
            FeatureMap features = Factory.newFeatureMap();
            features.put("kind", "external");
            outputAS.add(Long.valueOf(split.start()), Long.valueOf(split.end()), "Split", features);

            // Trim trailing space characters before the split ...
            int sentenceEnd = split.start();
            while (sentenceEnd > lastSentenceEnd && Character.isSpaceChar(Character.codePointAt(docText, sentenceEnd - 1))) {
                --sentenceEnd;
            }
            // ... and leading space characters after the previous sentence.
            int sentenceStart = lastSentenceEnd;
            while (sentenceStart < sentenceEnd && Character.isSpaceChar(Character.codePointAt(docText, sentenceStart))) {
                ++sentenceStart;
            }
            if (sentenceStart < sentenceEnd) {
                outputAS.add(Long.valueOf(sentenceStart), Long.valueOf(sentenceEnd), "Sentence", Factory.newFeatureMap());
            }
            return split.end();
        }
        catch (InvalidOffsetException e) {
            throw new ExecutionException(e);
        }
    }

    /**
     * Decides whether a split is vetoed by a non-split region.
     *
     * <p>Regions that end before the split starts are removed from
     * {@code vetoRegions} as a side effect, so the list shrinks as splits are
     * processed in increasing start order. The first region that remains
     * decides the outcome: the split is vetoed when its last character is at or
     * after that region's start.
     *
     * @param split the candidate split
     * @param vetoRegions {@code [start, end)} pairs in document order; modified in place
     * @return {@code true} if the split overlaps the first remaining veto region
     */
    private boolean veto(MatchResult split, List<int[]> vetoRegions) {
        for (Iterator<int[]> regions = vetoRegions.iterator(); regions.hasNext();) {
            int[] region = regions.next();
            if (region[1] - 1 < split.start()) {
                // Region lies entirely before this split; drop it.
                regions.remove();
                continue;
            }
            return split.end() - 1 >= region[0];
        }
        return false;
    }

    /**
     * Validates the configuration and compiles the three pattern lists.
     *
     * @return this resource
     * @throws ResourceInstantiationException if a list location or the encoding
     *         is missing, or if a list cannot be read
     */
    public Resource init() throws ResourceInstantiationException {
        super.init();
        if (internalSplitListURL == null) {
            throw new ResourceInstantiationException("No list of internal splits provided!");
        }
        if (externalSplitListURL == null) {
            throw new ResourceInstantiationException("No list of external splits provided!");
        }
        if (nonSplitListURL == null) {
            throw new ResourceInstantiationException("No list of non splits provided!");
        }
        if (encoding == null) {
            throw new ResourceInstantiationException("No encoding provided!");
        }
        try {
            internalSplitsPattern = compilePattern(internalSplitListURL.toURL(), encoding);
            externalSplitsPattern = compilePattern(externalSplitListURL.toURL(), encoding);
            nonSplitsPattern = compilePattern(nonSplitListURL.toURL(), encoding);
        }
        catch (IOException e) {
            // Also covers UnsupportedEncodingException, which is an IOException.
            throw new ResourceInstantiationException(e);
        }
        return this;
    }

    /** Wraps a URL in a {@link ResourceReference}, rethrowing a syntax failure unchecked. */
    private static ResourceReference toResourceReference(URL url) {
        try {
            return new ResourceReference(url);
        }
        catch (URISyntaxException e) {
            throw new RuntimeException("Error converting URL to ResourceReference", e);
        }
    }

    /** @return the name of the output annotation set, possibly {@code null} */
    public String getOutputASName() {
        return this.outputASName;
    }

    /** @param outputASName name of the output annotation set; {@code null} or blank selects the default set */
    @RunTime
    @Optional
    @CreoleParameter(comment="The annotation set to be used as output for 'Sentence' and 'Split' annotations")
    public void setOutputASName(String outputASName) {
        this.outputASName = outputASName;
    }

    /** @return the encoding used to read the pattern lists */
    public String getEncoding() {
        return this.encoding;
    }

    /** @param encoding the encoding used to read the pattern lists */
    @CreoleParameter(comment="The encoding used for reading the definition files", defaultValue="UTF-8")
    public void setEncoding(String encoding) {
        this.encoding = encoding;
    }

    /** @return the location of the internal split pattern list */
    public ResourceReference getInternalSplitListURL() {
        return this.internalSplitListURL;
    }

    /** @param internalSplitListURL the location of the internal split pattern list */
    @CreoleParameter(defaultValue="resources/regex-splitter/internal-split-patterns.txt", suffixes="txt", comment="The URL to the internal splits pattern list")
    public void setInternalSplitListURL(ResourceReference internalSplitListURL) {
        this.internalSplitListURL = internalSplitListURL;
    }

    /**
     * @param internalSplitListURL the location of the internal split pattern list
     * @deprecated use {@link #setInternalSplitListURL(ResourceReference)} instead
     */
    @Deprecated
    public void setInternalSplitListURL(URL internalSplitListURL) {
        this.setInternalSplitListURL(toResourceReference(internalSplitListURL));
    }

    /** @return the location of the external split pattern list */
    public ResourceReference getExternalSplitListURL() {
        return this.externalSplitListURL;
    }

    /** @param externalSplitListURL the location of the external split pattern list */
    @CreoleParameter(defaultValue="resources/regex-splitter/external-split-patterns.txt", comment="The URL to the external splits pattern list", suffixes="txt")
    public void setExternalSplitListURL(ResourceReference externalSplitListURL) {
        this.externalSplitListURL = externalSplitListURL;
    }

    /**
     * @param externalSplitListURL the location of the external split pattern list
     * @deprecated use {@link #setExternalSplitListURL(ResourceReference)} instead
     */
    @Deprecated
    public void setExternalSplitListURL(URL externalSplitListURL) {
        this.setExternalSplitListURL(toResourceReference(externalSplitListURL));
    }

    /** @return the location of the non-split pattern list */
    public ResourceReference getNonSplitListURL() {
        return this.nonSplitListURL;
    }

    /** @param nonSplitListURL the location of the non-split pattern list */
    @CreoleParameter(defaultValue="resources/regex-splitter/non-split-patterns.txt", comment="The URL to the non splits pattern list", suffixes="txt")
    public void setNonSplitListURL(ResourceReference nonSplitListURL) {
        this.nonSplitListURL = nonSplitListURL;
    }

    /**
     * @param nonSplitListURL the location of the non-split pattern list
     * @deprecated use {@link #setNonSplitListURL(ResourceReference)} instead
     */
    @Deprecated
    public void setNonSplitListURL(URL nonSplitListURL) {
        this.setNonSplitListURL(toResourceReference(nonSplitListURL));
    }

    /** @return the compiled internal splits pattern, or {@code null} if none has been set or compiled yet */
    public Pattern getInternalSplitsPattern() {
        return this.internalSplitsPattern;
    }

    /** @param internalSplitsPattern the compiled pattern to use for internal splits */
    public void setInternalSplitsPattern(Pattern internalSplitsPattern) {
        this.internalSplitsPattern = internalSplitsPattern;
    }

    /**
     * Orders match results by ascending start offset, with {@code null}
     * entries sorted last. Static because it needs no state from the splitter.
     */
    private static class MatchResultComparator implements Comparator<MatchResult> {
        private MatchResultComparator() {
        }

        @Override
        public int compare(MatchResult o1, MatchResult o2) {
            if (o1 == null && o2 == null) {
                return 0;
            }
            if (o1 == null) {
                return 1;
            }
            if (o2 == null) {
                return -1;
            }
            return Integer.compare(o1.start(), o2.start());
        }
    }
}

