/*
 * Decompiled with CFR 0.152.
 */
package org.languagetool.dev.bigdata;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.jetbrains.annotations.NotNull;
import org.languagetool.Language;
import org.languagetool.Languages;
import org.languagetool.dev.eval.SimpleCorpusEvaluator;
import org.languagetool.rules.en.GoogleStyleWordTokenizer;
import org.languagetool.tokenizers.SentenceTokenizer;
import org.languagetool.tokenizers.Tokenizer;
import org.tukaani.xz.XZInputStream;

class CommonCrawlToNgram
implements AutoCloseable {
    private static final double THRESHOLD = 1.0E-11;
    private static final int MAX_TOKEN_LENGTH = 20;
    private final File input;
    private final File indexTopDir;
    private final File evalFile;
    private final SentenceTokenizer sentenceTokenizer;
    private final Tokenizer wordTokenizer;
    private final Map<String, Long> unigramToCount = new HashMap<String, Long>();
    private final Map<String, Long> bigramToCount = new HashMap<String, Long>();
    private final Map<String, Long> trigramToCount = new HashMap<String, Long>();
    private final Map<Integer, LuceneLiveIndex> indexes = new HashMap<Integer, LuceneLiveIndex>();
    private int cacheLimit = 1000000;
    private long charCount = 0L;
    private long lineCount = 0L;

    CommonCrawlToNgram(Language language, File input, File indexTopDir, File evalFile) throws IOException {
        this.input = input;
        this.indexTopDir = indexTopDir;
        this.evalFile = evalFile;
        this.sentenceTokenizer = language.getSentenceTokenizer();
        this.wordTokenizer = new GoogleStyleWordTokenizer();
        this.indexes.put(1, new LuceneLiveIndex(new File(indexTopDir, "1grams")));
        this.indexes.put(2, new LuceneLiveIndex(new File(indexTopDir, "2grams")));
        this.indexes.put(3, new LuceneLiveIndex(new File(indexTopDir, "3grams")));
    }

    @Override
    public void close() throws IOException {
        for (LuceneLiveIndex index : this.indexes.values()) {
            index.close();
        }
    }

    void setCacheLimit(int cacheLimit) {
        this.cacheLimit = cacheLimit;
    }

    void indexInputFile() throws IOException {
        this.writeAndEvaluate();
        FileInputStream fin = new FileInputStream(this.input);
        BufferedInputStream in = new BufferedInputStream(fin);
        try (XZInputStream xzIn = new XZInputStream((InputStream)in);){
            int n;
            byte[] buffer = new byte[8192];
            while ((n = xzIn.read(buffer)) != -1) {
                String buf = new String(buffer, 0, n);
                String[] lines = buf.split("\n");
                this.indexLine(lines);
            }
        }
        this.writeAndEvaluate();
    }

    private void indexLine(String[] lines) throws IOException {
        for (String line : lines) {
            if (this.lineCount++ % 50000L == 0L) {
                float mb = (float)this.charCount / 1000.0f / 1000.0f;
                System.out.printf(Locale.ENGLISH, "Indexing line %d (%.2fMB)\n", this.lineCount, Float.valueOf(mb));
            }
            this.charCount += (long)line.length();
            List sentences = this.sentenceTokenizer.tokenize(line);
            for (String sentence : sentences) {
                this.indexSentence(sentence);
            }
        }
    }

    private void indexSentence(String sentence) throws IOException {
        List tokens = this.wordTokenizer.tokenize(sentence);
        tokens.add(0, "_START_");
        tokens.add("_END_");
        String prevPrev = null;
        String prev = null;
        for (String token : tokens) {
            String ngram;
            if (token.trim().isEmpty()) continue;
            if (token.length() <= 20) {
                this.unigramToCount.compute(token, (k, v) -> v == null ? 1L : v + 1L);
            }
            if (prev != null && token.length() <= 20 && prev.length() <= 20) {
                ngram = prev + " " + token;
                this.bigramToCount.compute(ngram, (k, v) -> v == null ? 1L : v + 1L);
            }
            if (prevPrev != null && prev != null) {
                if (token.length() <= 20 && prev.length() <= 20 && prevPrev.length() <= 20) {
                    ngram = prevPrev + " " + prev + " " + token;
                    this.trigramToCount.compute(ngram, (k, v) -> v == null ? 1L : v + 1L);
                }
                if (this.trigramToCount.size() > this.cacheLimit) {
                    this.writeAndEvaluate();
                }
            }
            prevPrev = prev;
            prev = token;
        }
    }

    private void writeAndEvaluate() throws IOException {
        this.writeToLucene(1, this.unigramToCount);
        this.writeToLucene(2, this.bigramToCount);
        this.writeToLucene(3, this.trigramToCount);
        if (this.evalFile != null) {
            System.out.println("Running evaluation...");
            long startTime = System.currentTimeMillis();
            SimpleCorpusEvaluator evaluator = new SimpleCorpusEvaluator(this.indexTopDir);
            evaluator.run(this.evalFile, 1.0E-11);
            System.out.println("Eval time: " + (System.currentTimeMillis() - startTime) + "ms");
        } else {
            System.out.println("Skipping evaluation, no evaluation file specified");
        }
    }

    private void writeToLucene(int ngramSize, Map<String, Long> ngramToCount) throws IOException {
        long startTime = System.currentTimeMillis();
        System.out.println("Writing " + ngramToCount.size() + " cached ngrams to Lucene index (ngramSize=" + ngramSize + ")...");
        LuceneLiveIndex index = this.indexes.get(ngramSize);
        index.reader = DirectoryReader.open((IndexWriter)index.indexWriter, (boolean)true);
        index.searcher = new IndexSearcher((IndexReader)index.reader);
        for (Map.Entry<String, Long> entry : ngramToCount.entrySet()) {
            Term ngram = new Term("ngram", entry.getKey());
            TopDocs topDocs = index.searcher.search((Query)new TermQuery(ngram), 2);
            if (topDocs.totalHits == 0) {
                Document doc = this.getDoc(entry.getKey(), entry.getValue());
                index.indexWriter.addDocument((Iterable)doc);
                continue;
            }
            if (topDocs.totalHits == 1) {
                int docNumber = topDocs.scoreDocs[0].doc;
                Document document = index.reader.document(docNumber);
                long oldCount = Long.parseLong(document.getField("count").stringValue());
                index.indexWriter.deleteDocuments(new Term[]{ngram});
                index.indexWriter.addDocument((Iterable)this.getDoc(entry.getKey(), oldCount + entry.getValue()));
                continue;
            }
            if (topDocs.totalHits <= 1) continue;
            throw new RuntimeException("Got more than one hit for: " + ngram);
        }
        if (ngramSize == 1) {
            long total = ngramToCount.values().stream().mapToLong(Number::longValue).sum();
            System.out.println("Adding totalTokenCount doc: " + total);
            this.addTotalTokenCountDoc(total, index.indexWriter);
        }
        System.out.println("Commit...");
        index.indexWriter.commit();
        System.out.println("Commit done, indexing took " + (System.currentTimeMillis() - startTime) + "ms");
        ngramToCount.clear();
    }

    @NotNull
    private Document getDoc(String ngram, long count) {
        Document doc = new Document();
        doc.add((IndexableField)new Field("ngram", ngram, StringField.TYPE_NOT_STORED));
        doc.add((IndexableField)this.getCountField(count));
        return doc;
    }

    @NotNull
    private LongField getCountField(long count) {
        FieldType fieldType = new FieldType();
        fieldType.setStored(true);
        fieldType.setOmitNorms(true);
        fieldType.setNumericType(FieldType.NumericType.LONG);
        fieldType.setDocValuesType(DocValuesType.NUMERIC);
        return new LongField("count", count, fieldType);
    }

    private void addTotalTokenCountDoc(long totalTokenCount, IndexWriter writer) throws IOException {
        FieldType fieldType = new FieldType();
        fieldType.setIndexOptions(IndexOptions.DOCS);
        fieldType.setStored(true);
        fieldType.setOmitNorms(true);
        Field countField = new Field("totalTokenCount", String.valueOf(totalTokenCount), fieldType);
        Document doc = new Document();
        doc.add((IndexableField)countField);
        writer.addDocument((Iterable)doc);
    }

    public static void main(String[] args) throws IOException {
        if (args.length != 4) {
            System.out.println("Usage: " + CommonCrawlToNgram.class + " <langCode> <input.xz> <ngramIndexDir> <simpleEvalFile>");
            System.out.println(" <simpleEvalFile> a plain text file with simple error markup");
            System.exit(1);
        }
        Language language = Languages.getLanguageForShortCode((String)args[0]);
        File input = new File(args[1]);
        File outputDir = new File(args[2]);
        File evalFile = new File(args[3]);
        try (CommonCrawlToNgram prg = new CommonCrawlToNgram(language, input, outputDir, evalFile);){
            prg.indexInputFile();
        }
    }

    static class LuceneLiveIndex {
        private final Directory directory;
        private final IndexWriter indexWriter;
        private DirectoryReader reader;
        private IndexSearcher searcher;

        LuceneLiveIndex(File dir) throws IOException {
            StandardAnalyzer analyzer = new StandardAnalyzer();
            IndexWriterConfig config = new IndexWriterConfig((Analyzer)analyzer);
            this.directory = FSDirectory.open((Path)dir.toPath());
            this.indexWriter = new IndexWriter(this.directory, config);
            this.reader = DirectoryReader.open((IndexWriter)this.indexWriter, (boolean)false);
            this.searcher = new IndexSearcher((IndexReader)this.reader);
        }

        void close() throws IOException {
            this.reader.close();
            this.indexWriter.close();
            this.directory.close();
        }
    }
}

