/*
 * Decompiled with CFR 0.152.
 */
package org.languagetool.dev.bigdata;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RegexpQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.languagetool.JLanguageTool;
import org.languagetool.Language;
import org.languagetool.Languages;
import org.languagetool.dev.bigdata.ProhibitedCompoundRuleEvaluator;
import org.languagetool.dev.bigdata.RuleEvalResult;
import org.languagetool.languagemodel.LanguageModel;
import org.languagetool.languagemodel.LuceneLanguageModel;
import org.languagetool.rules.ConfusionPair;
import org.languagetool.rules.ConfusionSetLoader;

/**
 * Command-line tool that reads candidate word pairs (one semicolon-separated
 * pair per line), collects example sentences for both words from a Lucene
 * index, runs {@link ProhibitedCompoundRuleEvaluator} on them and prints the
 * results that reach {@link #MIN_PRECISION} and {@link #MIN_RECALL}.
 *
 * <p>Pairs that are already part of the loaded confusion sets, or that have
 * been evaluated earlier in the same run, are skipped.
 *
 * <p>Instances hold an open Lucene index and must be closed after use.
 */
class AutomaticProhibitedCompoundRuleEvaluator implements AutoCloseable {
    private static final String LANGUAGE = "de";
    /** Upper bound on example sentences fetched per word and passed to the evaluator. */
    private static final int MAX_EXAMPLES = 1000;
    /** A word needs strictly more than this many example sentences to be evaluated. */
    private static final int MIN_EXAMPLES = 50;
    private static final List<Long> EVAL_FACTORS = Arrays.asList(10L);
    private static final float MIN_PRECISION = 0.95f;
    private static final float MIN_RECALL = 0.1f;
    private static final String LUCENE_CONTENT_FIELD = "fieldLowercase";
    private static final String TEMP_FILE_NAME = "example-sentences.txt";

    private final Directory directory;
    private final DirectoryReader reader;
    private final IndexSearcher searcher;
    private final Map<String, List<ConfusionPair>> knownSets;
    private final Set<String> finishedPairs = new HashSet<>();
    private final Language language = Languages.getLanguageForShortCode(LANGUAGE);
    private int ignored = 0;

    /**
     * Loads the known confusion sets and opens the example sentence index.
     *
     * @param luceneIndexDir directory of the Lucene index to search for example sentences
     * @throws IOException if the confusion sets or the index cannot be read
     */
    AutomaticProhibitedCompoundRuleEvaluator(File luceneIndexDir) throws IOException {
        String confusionSetPath = "/" + LANGUAGE + "/confusion_sets.txt";
        try (InputStream confusionSetStream = JLanguageTool.getDataBroker().getFromResourceDirAsStream(confusionSetPath)) {
            this.knownSets = new ConfusionSetLoader(this.language).loadConfusionPairs(confusionSetStream);
        }
        Directory indexDirectory = FSDirectory.open(luceneIndexDir.toPath());
        DirectoryReader indexReader;
        try {
            indexReader = DirectoryReader.open(indexDirectory);
        } catch (IOException | RuntimeException e) {
            // Do not leak the directory handle when the index itself cannot be opened.
            indexDirectory.close();
            throw e;
        }
        this.directory = indexDirectory;
        this.reader = indexReader;
        this.searcher = new IndexSearcher(indexReader);
    }

    /**
     * Closes the index reader and the directory opened by the constructor.
     *
     * @throws IOException if closing either resource fails
     */
    @Override
    public void close() throws IOException {
        try {
            this.reader.close();
        } finally {
            this.directory.close();
        }
    }

    /**
     * Evaluates every candidate pair in {@code lines}.
     *
     * <p>Blank lines and lines containing {@code #} are skipped. A
     * {@link RuntimeException} raised while evaluating one pair is reported and
     * does not stop the remaining pairs from being processed.
     *
     * @param lines    candidate pairs, each of the form {@code word1; word2}
     * @param indexDir directory handed to {@link LuceneLanguageModel}
     * @throws IOException if a line does not consist of exactly two
     *                     semicolon-separated parts, or on I/O failure
     */
    private void run(List<String> lines, File indexDir) throws IOException {
        try (LuceneLanguageModel lm = new LuceneLanguageModel(indexDir)) {
            ProhibitedCompoundRuleEvaluator evaluator = new ProhibitedCompoundRuleEvaluator(this.language, lm);
            int totalLines = lines.size();
            int lineCount = 0;
            for (String line : lines) {
                ++lineCount;
                if (line.trim().isEmpty()) {
                    // Tolerate blank lines instead of failing the whole run on them.
                    continue;
                }
                if (line.contains("#")) {
                    System.out.println("Ignoring: " + line);
                    continue;
                }
                String[] parts = line.split(";\\s*");
                if (parts.length != 2) {
                    throw new IOException("Expected semicolon-separated input: " + line);
                }
                try {
                    this.runOnPair(evaluator, line, lineCount, totalLines, this.removeComment(parts[0]), this.removeComment(parts[1]));
                } catch (RuntimeException e) {
                    // Best effort: report the failing pair and carry on with the next line.
                    System.err.println("Failed on line " + lineCount + ": " + line);
                    e.printStackTrace();
                }
            }
        }
        System.out.println("Done. Ignored items because they are already known: " + this.ignored);
    }

    /**
     * Strips everything from the first {@code |} onwards and trims the rest.
     *
     * @param str raw field taken from an input line
     * @return the field without its trailing {@code |...} part
     */
    private String removeComment(String str) {
        return str.replaceFirst("\\|.*", "").trim();
    }

    /**
     * Evaluates a single candidate pair and prints the outcome.
     *
     * @param evaluator  evaluator to run on the collected example sentences
     * @param line       original input line, used for progress output only
     * @param lineCount  1-based number of {@code line}
     * @param totalLines total number of input lines
     * @param part1      first word of the pair
     * @param part2      second word of the pair
     * @throws IOException if searching the index or writing the example file fails
     */
    private void runOnPair(ProhibitedCompoundRuleEvaluator evaluator, String line, int lineCount, int totalLines, String part1, String part2) throws IOException {
        String pairName = part1 + "/" + part2;
        if (this.finishedPairs.contains(pairName) || this.finishedPairs.contains(part2 + "/" + part1)) {
            System.out.println("Ignoring: " + pairName + ", finished before");
            return;
        }
        if (this.isKnownPair(part1, part2)) {
            System.out.println("Ignoring: " + pairName + ", in active confusion sets already");
            ++this.ignored;
            return;
        }
        System.out.println("Working on: " + line + " (" + lineCount + " of " + totalLines + ")");
        try {
            File sentencesFile = this.writeExampleSentencesToTempFile(new String[]{part1, part2});
            List<String> input = Arrays.asList(sentencesFile.getAbsolutePath());
            Map<Long, RuleEvalResult> results = evaluator.run(input, part1, part2, MAX_EXAMPLES, EVAL_FACTORS);
            Map<Long, RuleEvalResult> bestResults = this.findBestFactor(results);
            if (bestResults.isEmpty()) {
                System.out.println("No good result found for " + pairName);
            } else {
                for (RuleEvalResult result : bestResults.values()) {
                    System.out.println("=> " + result.getSummary());
                }
            }
            this.finishedPairs.add(pairName);
        } catch (TooFewExamples e) {
            System.out.println("Skipping " + pairName + ", too few examples: " + e.getMessage());
        }
    }

    /**
     * Checks whether a confusion pair registered under {@code part1} already
     * contains both words.
     */
    private boolean isKnownPair(String part1, String part2) {
        List<ConfusionPair> candidates = this.knownSets.get(part1);
        if (candidates == null) {
            return false;
        }
        List<String> wanted = Arrays.asList(part1, part2);
        for (ConfusionPair pair : candidates) {
            Set<String> terms = pair.getTerms().stream().map(term -> term.getString()).collect(Collectors.toSet());
            if (terms.containsAll(wanted)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Keeps only the results that reach both {@link #MIN_PRECISION} and
     * {@link #MIN_RECALL}.
     *
     * @param results evaluation results keyed by factor
     * @return the acceptable subset of {@code results}, possibly empty
     */
    private Map<Long, RuleEvalResult> findBestFactor(Map<Long, RuleEvalResult> results) {
        Map<Long, RuleEvalResult> filteredResults = new HashMap<>();
        for (Map.Entry<Long, RuleEvalResult> entry : results.entrySet()) {
            RuleEvalResult result = entry.getValue();
            if (result.getPrecision() >= MIN_PRECISION && result.getRecall() >= MIN_RECALL) {
                filteredResults.put(entry.getKey(), result);
            }
        }
        return filteredResults;
    }

    /**
     * Writes example sentences for all {@code words} into one file in the
     * system temp directory. The same file is overwritten on every call.
     *
     * @param words words to collect example sentences for
     * @return the file that was written
     * @throws IOException    if searching or writing fails
     * @throws TooFewExamples if any word has no more than {@link #MIN_EXAMPLES} sentences
     */
    private File writeExampleSentencesToTempFile(String[] words) throws IOException {
        // Resolve against the platform temp directory rather than a fixed "/tmp".
        File tempFile = new File(System.getProperty("java.io.tmpdir"), TEMP_FILE_NAME);
        int count = 0;
        // NOTE(review): FileWriter uses the platform default charset - confirm this
        // matches the charset the evaluator uses when reading the file back.
        try (FileWriter fw = new FileWriter(tempFile)) {
            for (String word : words) {
                int tmpCount = this.findExampleSentences(word, fw);
                if (tmpCount <= MIN_EXAMPLES) {
                    throw new TooFewExamples(word, tmpCount);
                }
                count += tmpCount;
            }
            System.out.println(count + " example sentences written to " + tempFile);
        }
        return tempFile;
    }

    /**
     * Searches the index for up to {@link #MAX_EXAMPLES} documents matching
     * {@code word} and writes each distinct sentence on its own line.
     *
     * @param word word to look for
     * @param fw   writer receiving the sentences
     * @return number of distinct sentences written
     * @throws IOException if searching or writing fails
     */
    private int findExampleSentences(String word, FileWriter fw) throws IOException {
        // NOTE(review): word is inserted into the regular expression unescaped;
        // presumably candidates never contain regexp operators - verify against the input.
        Term term = new Term(LUCENE_CONTENT_FIELD, ".+" + word + "|" + StringUtils.capitalize(word) + ".+");
        long t1 = System.currentTimeMillis();
        TopDocs topDocs = this.searcher.search(new RegexpQuery(term), MAX_EXAMPLES);
        long t2 = System.currentTimeMillis();
        int count = 0;
        Set<String> foundSentences = new HashSet<>();
        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            String sentence = this.searcher.doc(scoreDoc.doc).get(LUCENE_CONTENT_FIELD);
            if (sentence == null) {
                // No stored value for the content field: nothing useful to write.
                continue;
            }
            // add() returns false for a sentence that has been written already.
            if (foundSentences.add(sentence)) {
                fw.write(sentence + "\n");
                ++count;
            }
            if (count >= MAX_EXAMPLES) {
                break;
            }
        }
        long t3 = System.currentTimeMillis();
        long searchTime = t2 - t1;
        long iterateTime = t3 - t2;
        System.out.println("Found " + count + " examples for " + word + " (" + searchTime + "ms, " + iterateTime + "ms)");
        return count;
    }

    /**
     * Entry point.
     *
     * @param args candidate pair file, example sentence index directory, ngram directory
     * @throws IOException if reading the input or accessing an index fails
     */
    public static void main(String[] args) throws IOException {
        if (args.length != 3) {
            System.out.println("Usage: " + AutomaticProhibitedCompoundRuleEvaluator.class.getSimpleName() + " <confusionPairCandidates> <exampleSentenceIndexDir> <ngramDir>");
            System.out.println("   <confusionPairCandidates> is a semicolon-separated list of words (one pair per line)");
            System.out.println("   <exampleSentenceIndexDir> is a Lucene index created by TextIndexCreator");
            System.out.println("   <ngramDir> is the directory passed to LuceneLanguageModel");
            System.exit(1);
        }
        List<String> lines;
        try (InputStream candidateStream = new FileInputStream(args[0])) {
            lines = IOUtils.readLines(candidateStream, "utf-8");
        }
        try (AutomaticProhibitedCompoundRuleEvaluator eval = new AutomaticProhibitedCompoundRuleEvaluator(new File(args[1]))) {
            eval.run(lines, new File(args[2]));
        }
    }

    /**
     * Signals that the index holds too few example sentences for a word.
     */
    static class TooFewExamples
    extends RuntimeException {
        private final String word;
        private final int exampleCount;

        /**
         * @param word         word that was searched for
         * @param exampleCount number of example sentences found for {@code word}
         */
        TooFewExamples(String word, int exampleCount) {
            this.word = word;
            this.exampleCount = exampleCount;
        }

        @Override
        public String getMessage() {
            return this.exampleCount + " matches for " + this.word;
        }
    }
}

