/*
 * Decompiled with CFR 0.152.
 */
package org.languagetool.dev.bigdata;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.SortedNumericSortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopFieldDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.languagetool.JLanguageTool;
import org.languagetool.Language;
import org.languagetool.Languages;
import org.languagetool.dev.bigdata.ConfusionRuleEvaluator;
import org.languagetool.dev.bigdata.RuleEvalResult;
import org.languagetool.languagemodel.LanguageModel;
import org.languagetool.languagemodel.LuceneLanguageModel;
import org.languagetool.rules.ConfusionPair;
import org.languagetool.rules.ConfusionSetLoader;

/**
 * Command-line tool that evaluates candidate confusion pairs.
 *
 * <p>For every pair read from the candidate file, example sentences for both words are
 * looked up in a Lucene index, written to a temporary file and handed to
 * {@link ConfusionRuleEvaluator} together with a list of factors. Results that reach
 * {@link #MIN_PRECISION} and {@link #MIN_RECALL} are printed to standard output.
 *
 * <p>Pairs already present in the language's {@code confusion_sets.txt} are skipped.
 */
class AutomaticConfusionRuleEvaluator {
    /** Maximum number of example sentences collected per word and passed to the evaluator. */
    private static final int MAX_EXAMPLES = 2000;
    /** A word needs more than this many example sentences, otherwise its pair is skipped. */
    private static final int MIN_EXAMPLES = 50;
    /** First factor to evaluate (inclusive). */
    private static final long EVAL_FACTORS_MIN = 10L;
    /** Upper bound of the factors to evaluate (exclusive). */
    private static final long EVAL_FACTORS_MAX = 10000000L;
    /** Powers of ten from {@link #EVAL_FACTORS_MIN} up to, but excluding, {@link #EVAL_FACTORS_MAX}. */
    private static final List<Long> EVAL_FACTORS = new ArrayList<>();
    private static final float MIN_PRECISION = 0.95f;
    private static final float MIN_RECALL = 0.1f;
    /** Number of hits requested from the Lucene index per word. */
    private static final int MAX_SEARCH_HITS = 20000;
    /** For each word looked up so far: how many written example sentences came from which source. */
    private static final Map<String, Map<String, Integer>> wordToSources = new HashMap<>();

    static {
        for (long factor = EVAL_FACTORS_MIN; factor < EVAL_FACTORS_MAX; factor *= 10L) {
            EVAL_FACTORS.add(factor);
        }
    }

    private final IndexSearcher searcher;
    private final Map<String, List<ConfusionPair>> knownSets;
    /** Keys of the form {@code word1/word2} for pairs that have been evaluated already. */
    private final Set<String> finishedPairs = new HashSet<>();
    private final String fieldName;
    private final boolean caseInsensitive;
    private final Language lang;
    /** Number of pairs skipped because they are in the known confusion sets. */
    private int ignored = 0;

    /**
     * Opens the example sentence index and loads the known confusion sets of the language.
     *
     * @param luceneIndexDir directory of the Lucene index with the example sentences
     * @param fieldName name of the index field that is searched and read
     * @param caseInsensitive whether words are lowercased before searching
     * @param lang language whose {@code confusion_sets.txt} is loaded
     * @throws IOException if the index or the confusion sets cannot be read
     */
    private AutomaticConfusionRuleEvaluator(File luceneIndexDir, String fieldName, boolean caseInsensitive, Language lang) throws IOException {
        this.fieldName = fieldName;
        this.caseInsensitive = caseInsensitive;
        this.lang = lang;
        System.out.println("Using " + luceneIndexDir + " to search example sentences");
        // NOTE(review): the reader stays open for the lifetime of this object; the class has no close().
        Directory directory = FSDirectory.open(luceneIndexDir.toPath());
        IndexReader reader = DirectoryReader.open(directory);
        this.searcher = new IndexSearcher(reader);
        String confusionSetPath = "/" + lang.getShortCode() + "/confusion_sets.txt";
        try (InputStream confusionSetStream = JLanguageTool.getDataBroker().getFromResourceDirAsStream(confusionSetPath)) {
            this.knownSets = new ConfusionSetLoader(lang).loadConfusionPairs(confusionSetStream);
        }
    }

    /**
     * Evaluates all pairs found in the given candidate lines.
     *
     * <p>Empty lines are skipped silently, lines containing {@code #} are skipped with a
     * message. The remaining lines are split at {@code ;} or {@code ->} and every
     * combination of two different parts is evaluated. A {@link RuntimeException} raised
     * while working on a line is printed and processing continues with the next line.
     *
     * @param lines candidate lines
     * @param indexDir directory passed to {@link LuceneLanguageModel}
     * @throws IOException if reading the index or writing the example sentences fails
     */
    private void run(List<String> lines, File indexDir) throws IOException {
        LanguageModel lm = new LuceneLanguageModel(indexDir);
        int totalLines = lines.size();
        int lineCount = 0;
        for (String line : lines) {
            ++lineCount;
            if (line.isEmpty()) {
                continue;
            }
            if (line.contains("#")) {
                System.out.println("Ignoring: " + line);
                continue;
            }
            float percent = (float) lineCount / (float) totalLines * 100.0f;
            System.out.printf(Locale.ENGLISH, "Line %d of %d (%.2f%%)\n", lineCount, totalLines, percent);
            String[] parts = line.split("\\s*(;|->)\\s*");
            boolean bothDirections = false;
            ConfusionRuleEvaluator evaluator = new ConfusionRuleEvaluator(this.lang, lm, this.caseInsensitive, bothDirections);
            try {
                for (String part1 : parts) {
                    for (String part2 : parts) {
                        if (part1.equals(part2)) {
                            continue;
                        }
                        String word1 = this.removeComment(part1);
                        String word2 = this.removeComment(part2);
                        if (bothDirections) {
                            this.runOnPair(evaluator, line, lineCount, totalLines, word1, word2, bothDirections);
                        } else {
                            // One-directional evaluation: each direction is a run of its own.
                            this.runOnPair(evaluator, line, lineCount, totalLines, word1, word2, false);
                            this.runOnPair(evaluator, line, lineCount, totalLines, word2, word1, false);
                        }
                    }
                }
            } catch (RuntimeException e) {
                // Best effort: a failing line must not abort the whole run.
                e.printStackTrace();
            }
        }
        System.out.println("Done. Ignored items because they are already known: " + this.ignored);
    }

    /** Returns {@code str} without the first {@code |} and everything following it. */
    private String removeComment(String str) {
        return str.replaceFirst("\\|.*", "");
    }

    /**
     * Evaluates a single pair unless it was evaluated before or is already part of the
     * known confusion sets, and prints the results that pass {@link #findBestFactor}.
     *
     * @param evaluator evaluator to run
     * @param line candidate line the pair comes from (only used for output)
     * @param lineCount number of that line (only used for output)
     * @param totalLines total number of candidate lines (only used for output)
     * @param part1 first word of the pair
     * @param part2 second word of the pair
     * @param bothDirections if true, a finished {@code part2/part1} also counts as finished
     * @throws IOException if reading the index or writing the example sentences fails
     */
    private void runOnPair(ConfusionRuleEvaluator evaluator, String line, int lineCount, int totalLines, String part1, String part2, boolean bothDirections) throws IOException {
        String pairKey = part1 + "/" + part2;
        boolean finishedBefore = this.finishedPairs.contains(pairKey)
                || (bothDirections && this.finishedPairs.contains(part2 + "/" + part1));
        if (finishedBefore) {
            System.out.println("Ignoring: " + part1 + "/" + part2 + ", finished before");
            return;
        }
        // Hard-wired switch: true evaluates pairs that are not known yet, false would
        // re-evaluate known pairs with their existing factor only.
        boolean evalNewsSets = true;
        boolean use = false;
        long existingFactor = 0L;
        List<ConfusionPair> knownPairs = this.knownSets.get(part1);
        if (knownPairs != null) {
            for (ConfusionPair pair : knownPairs) {
                Set<String> terms = pair.getTerms().stream().map(t -> t.getString()).collect(Collectors.toSet());
                if (!terms.containsAll(Arrays.asList(part1, part2))) {
                    continue;
                }
                System.out.println("Ignoring: " + part1 + "/" + part2 + ", in active confusion sets already");
                if (evalNewsSets) {
                    ++this.ignored;
                    return;
                }
                use = true;
                existingFactor = pair.getFactor();
            }
        }
        if (!evalNewsSets && !use) {
            System.out.println("Skipping, evalNewsSets=false and pair not known yet");
            return;
        }
        System.out.println("Working on: '" + part1 + "' / '" + part2 + "' from line: " + line + " (" + lineCount + " of " + totalLines + ")");
        try {
            File sentencesFile = this.writeExampleSentencesToTempFile(new String[]{part1, part2});
            List<String> input = Arrays.asList(sentencesFile.getAbsolutePath());
            List<Long> factors = evalNewsSets ? EVAL_FACTORS : Collections.singletonList(existingFactor);
            Map<Long, RuleEvalResult> results = evaluator.run(input, part1, part2, MAX_EXAMPLES, factors, wordToSources.get(part1), wordToSources.get(part2));
            Map<Long, RuleEvalResult> bestResults = this.findBestFactor(results);
            if (bestResults.isEmpty()) {
                System.out.println("No good result found for " + part1 + "/" + part2);
            } else {
                for (RuleEvalResult result : bestResults.values()) {
                    System.out.println("=> " + result.getSummary());
                }
            }
            this.finishedPairs.add(pairKey);
        } catch (TooFewExamples e) {
            System.out.println("Skipping " + part1 + "/" + part2 + ", too few examples: " + e.getMessage());
        }
    }

    /**
     * Returns the entries whose precision is at least {@link #MIN_PRECISION} and whose
     * recall is at least {@link #MIN_RECALL}, in the iteration order of {@code results}.
     */
    private Map<Long, RuleEvalResult> findBestFactor(Map<Long, RuleEvalResult> results) {
        Map<Long, RuleEvalResult> filteredResults = new LinkedHashMap<>();
        for (Map.Entry<Long, RuleEvalResult> entry : results.entrySet()) {
            RuleEvalResult result = entry.getValue();
            if (result.getPrecision() >= MIN_PRECISION && result.getRecall() >= MIN_RECALL) {
                filteredResults.put(entry.getKey(), result);
            }
        }
        return filteredResults;
    }

    /**
     * Writes example sentences for all given words into one file in the temp directory.
     * The file name is fixed, so every call overwrites the file of the previous call.
     *
     * @param words words to collect example sentences for
     * @return the file the sentences were written to
     * @throws TooFewExamples if a word has {@link #MIN_EXAMPLES} or fewer example sentences
     * @throws IOException if searching the index or writing the file fails
     */
    private File writeExampleSentencesToTempFile(String[] words) throws IOException {
        File tempFile = new File(System.getProperty("java.io.tmpdir"), "example-sentences.txt");
        int count = 0;
        try (FileWriter fw = new FileWriter(tempFile)) {
            for (String word : words) {
                Map<String, Integer> sourceToCount = this.findExampleSentences(word, fw);
                wordToSources.put(word, sourceToCount);
                // sum() yields 0 for a word without any sentence, so that case is
                // reported as TooFewExamples as well.
                int tmpCount = sourceToCount.values().stream().mapToInt(Integer::intValue).sum();
                if (tmpCount <= MIN_EXAMPLES) {
                    throw new TooFewExamples(word, tmpCount);
                }
                count += tmpCount;
            }
            System.out.println(count + " example sentences written to " + tempFile);
        }
        return tempFile;
    }

    /**
     * Searches the index for sentences containing {@code word} and writes up to
     * {@link #MAX_EXAMPLES} distinct ones to {@code fw}. Sentences in which the word
     * matches more than once are skipped. In case-sensitive mode the sentence must also
     * contain the word literally.
     *
     * @param word word to search for; lowercased for the query in case-insensitive mode
     * @param fw writer the sentences are appended to, one per line
     * @return number of written sentences per value of the document's {@code source} field
     * @throws IOException if searching the index or writing fails
     */
    private Map<String, Integer> findExampleSentences(String word, FileWriter fw) throws IOException {
        Term term = new Term(this.fieldName, this.caseInsensitive ? word.toLowerCase() : word);
        SortField randomOrder = new SortedNumericSortField("random", SortField.Type.INT);
        long t1 = System.currentTimeMillis();
        TopFieldDocs topDocs = this.searcher.search(new TermQuery(term), MAX_SEARCH_HITS, new Sort(randomOrder));
        long t2 = System.currentTimeMillis();
        // Compiled once per word; the word is quoted so that it is matched literally.
        Pattern wordPattern = Pattern.compile("\\b" + Pattern.quote(word) + "\\b");
        int count = 0;
        Set<String> foundSentences = new HashSet<>();
        Map<String, Integer> sourceToCount = new HashMap<>();
        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            Document doc = this.searcher.doc(scoreDoc.doc);
            String sentence = doc.get(this.fieldName);
            if (this.countRegexMatches(wordPattern, sentence) > 1) {
                continue;
            }
            boolean wordPresent = this.caseInsensitive || sentence.contains(word);
            if (wordPresent && !foundSentences.contains(sentence)) {
                this.writeSentence(fw, foundSentences, sentence, doc, sourceToCount);
                ++count;
            }
            if (count >= MAX_EXAMPLES) {
                break;
            }
        }
        long t3 = System.currentTimeMillis();
        long searchTime = t2 - t1;
        long iterateTime = t3 - t2;
        System.out.println("Found " + count + " examples for " + word + " (" + searchTime + "ms, " + iterateTime + "ms), case insensitive=" + this.caseInsensitive + ", totalHits: " + topDocs.totalHits + " for term '" + term + "'");
        System.out.println("Sources: " + sourceToCount);
        return sourceToCount;
    }

    /**
     * Writes the sentence followed by a newline, remembers it in {@code foundSentences}
     * and increments the counter of the document's {@code source} field by one.
     */
    private void writeSentence(FileWriter fw, Set<String> foundSentences, String sentence, Document doc, Map<String, Integer> sourceToCount) throws IOException {
        fw.write(sentence + "\n");
        foundSentences.add(sentence);
        String source = doc.get("source");
        // The first sentence of a source counts as 1, every further one adds 1.
        sourceToCount.merge(source, 1, Integer::sum);
    }

    /** Counts the non-overlapping matches of {@code pattern} in {@code sentence}. */
    private int countRegexMatches(Pattern pattern, String sentence) {
        int count = 0;
        Matcher matcher = pattern.matcher(sentence);
        while (matcher.find()) {
            ++count;
        }
        return count;
    }

    /**
     * Entry point. Expects exactly six arguments, see the usage message printed otherwise.
     *
     * @param args language code, candidate file, example sentence index directory,
     *             ngram directory, field name, case-insensitive flag
     * @throws IOException if the candidate file or one of the indexes cannot be read
     */
    public static void main(String[] args) throws IOException {
        if (args.length != 6) {
            System.out.println("Usage: " + AutomaticConfusionRuleEvaluator.class.getSimpleName() + " <languageCode> <confusionPairCandidates> <exampleSentenceIndexDir> <ngramDir> <fieldName> <true|false>");
            System.out.println("   <confusionPairCandidates> is a semicolon-separated list of words (one pair per line)");
            System.out.println("   <exampleSentenceIndexDir> is a Lucene index created by TextIndexCreator");
            System.out.println("   <fieldName> is the Lucene index field name, usually 'field' or 'fieldLowercase'");
            System.out.println("   <true|false> whether to run in case-insensitive mode");
            System.exit(1);
        }
        Language lang = Languages.getLanguageForShortCode(args[0]);
        List<String> lines;
        try (InputStream candidates = new FileInputStream(args[1])) {
            lines = IOUtils.readLines(candidates, "utf-8");
        }
        boolean caseInsensitive = args[5].equalsIgnoreCase("true");
        AutomaticConfusionRuleEvaluator eval = new AutomaticConfusionRuleEvaluator(new File(args[2]), args[4], caseInsensitive, lang);
        eval.run(lines, new File(args[3]));
    }

    /** Signals that the index does not contain enough example sentences for a word. */
    static class TooFewExamples
    extends RuntimeException {
        private final String word;
        private final int exampleCount;

        TooFewExamples(String word, int exampleCount) {
            this.word = word;
            this.exampleCount = exampleCount;
        }

        @Override
        public String getMessage() {
            return this.exampleCount + " matches for " + this.word;
        }
    }
}

