/*
 * Decompiled with CFR 0.152.
 */
package org.languagetool.dev.bigdata;

import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Scanner;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.jetbrains.annotations.NotNull;

/**
 * Converts aggregated ngram count files into Lucene indexes, one index per ngram size.
 *
 * <p>Each input line must have the form {@code ngram<TAB>count}; the size of an ngram is the
 * number of parts obtained by splitting it at single spaces. Ngrams of size 1, 2 and 3 are
 * written to the sub-directories {@code 1grams}, {@code 2grams} and {@code 3grams} of the index
 * top directory. The counts of all 1-grams are summed up while indexing so that {@link #main}
 * can store the total as an extra document in the 1-gram index.
 */
class AggregatedNgramToLucene implements AutoCloseable {
    /** Open indexes, keyed by ngram size (1, 2 and 3). */
    private final Map<Integer, LuceneIndex> indexes = new HashMap<>();
    /** Sum of the counts of all 1-grams indexed so far. */
    private long totalTokenCount = 0L;
    /** Number of input lines seen so far, including ignored ones. */
    private long lineCount = 0L;

    /**
     * Opens (creating them if needed) the three ngram indexes below {@code indexTopDir}.
     *
     * @param indexTopDir directory that will contain the {@code 1grams}, {@code 2grams} and
     *                    {@code 3grams} index directories
     * @throws IOException if an index cannot be opened; indexes opened before the failure are
     *                     closed again
     */
    AggregatedNgramToLucene(File indexTopDir) throws IOException {
        try {
            for (int ngramSize = 1; ngramSize <= 3; ngramSize++) {
                this.indexes.put(ngramSize, new LuceneIndex(new File(indexTopDir, ngramSize + "grams")));
            }
        } catch (IOException | RuntimeException e) {
            // Do not leak the writers/directories that were opened before the failure.
            try {
                this.closeIndexes();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }

    /**
     * Closes all indexes. Every index is attempted even if closing an earlier one fails.
     *
     * @throws IOException the first failure encountered, with later ones attached as suppressed
     */
    @Override
    public void close() throws IOException {
        this.closeIndexes();
    }

    /** Closes every open index, collecting I/O failures instead of stopping at the first one. */
    private void closeIndexes() throws IOException {
        IOException failure = null;
        for (LuceneIndex index : this.indexes.values()) {
            try {
                index.close();
            } catch (IOException e) {
                if (failure == null) {
                    failure = e;
                } else {
                    failure.addSuppressed(e);
                }
            }
        }
        if (failure != null) {
            throw failure;
        }
    }

    /**
     * Indexes every line of the given file.
     *
     * @param file text file with one {@code ngram<TAB>count} entry per line
     * @throws IOException if the file cannot be opened or read, or if writing to an index fails
     */
    void indexInputFile(File file) throws IOException {
        System.out.println("=== Indexing " + file + " ===");
        // NOTE(review): decodes as UTF-8 instead of the platform default charset, assuming the
        // Hadoop job named in the usage text writes UTF-8 - confirm against the producing job.
        try (Scanner scanner = new Scanner(file, "UTF-8")) {
            while (scanner.hasNextLine()) {
                this.indexLine(scanner.nextLine());
            }
            // Scanner swallows read errors and just reports "no more lines"; surface them so a
            // failing read does not silently produce a truncated index.
            IOException readFailure = scanner.ioException();
            if (readFailure != null) {
                throw readFailure;
            }
        }
    }

    /**
     * Parses one input line and adds it to the index matching its ngram size. Lines that do not
     * consist of exactly two tab-separated parts are reported on stderr and skipped.
     *
     * @param line a line of the form {@code ngram<TAB>count}
     * @throws IOException           if the document cannot be added to the index
     * @throws RuntimeException      if there is no index for the ngram's size
     * @throws NumberFormatException if the count is not a valid {@code long}
     */
    private void indexLine(String line) throws IOException {
        // Post-increment: the progress message therefore shows the 1-based number of this line.
        if (this.lineCount++ % 250_000L == 0L) {
            System.out.printf(Locale.ENGLISH, "Indexing line %d\n", this.lineCount);
        }
        String[] lineParts = line.split("\t");
        if (lineParts.length != 2) {
            System.err.println("Not 2 parts but " + lineParts.length + ", ignoring: '" + line + "'");
            return;
        }
        String ngram = lineParts[0];
        String[] ngramParts = ngram.split(" ");
        LuceneIndex index = this.indexes.get(ngramParts.length);
        if (index == null) {
            throw new RuntimeException("No ngram data found for: " + Arrays.toString(lineParts));
        }
        long count = Long.parseLong(lineParts[1]);
        if (ngramParts.length == 1) {
            // Only 1-gram counts contribute to the total token count.
            this.totalTokenCount += count;
        }
        index.indexWriter.addDocument(this.getDoc(ngram, count));
    }

    /**
     * Builds the document for one ngram: an {@code ngram} field of type
     * {@code StringField.TYPE_NOT_STORED} plus the {@code count} field.
     */
    @NotNull
    private Document getDoc(String ngram, long count) {
        Document doc = new Document();
        doc.add(new Field("ngram", ngram, StringField.TYPE_NOT_STORED));
        doc.add(this.getCountField(count));
        return doc;
    }

    /**
     * Builds the {@code count} field: a stored long value with numeric doc values and norms
     * omitted.
     */
    @NotNull
    private LongField getCountField(long count) {
        FieldType fieldType = new FieldType();
        fieldType.setStored(true);
        fieldType.setOmitNorms(true);
        fieldType.setNumericType(FieldType.NumericType.LONG);
        fieldType.setDocValuesType(DocValuesType.NUMERIC);
        return new LongField("count", count, fieldType);
    }

    /**
     * Adds a document that carries the total token count in a stored field named
     * {@code totalTokenCount} (the value is stored as its decimal string).
     *
     * @param totalTokenCount the value to store
     * @param writer          the index writer that receives the document
     * @throws IOException if the document cannot be added
     */
    private void addTotalTokenCountDoc(long totalTokenCount, IndexWriter writer) throws IOException {
        FieldType fieldType = new FieldType();
        fieldType.setIndexOptions(IndexOptions.DOCS);
        fieldType.setStored(true);
        fieldType.setOmitNorms(true);
        Field countField = new Field("totalTokenCount", String.valueOf(totalTokenCount), fieldType);
        Document doc = new Document();
        doc.add(countField);
        writer.addDocument(doc);
    }

    /**
     * Command line entry point: indexes all regular files directly inside the input directory
     * into {@code <inputDir>/index} and finally stores the total token count in the 1-gram index.
     *
     * @param args exactly one argument, the input directory
     * @throws IOException if the input directory cannot be listed or indexing fails
     */
    public static void main(String[] args) throws IOException {
        if (args.length != 1) {
            System.out.println("Usage: " + AggregatedNgramToLucene.class + " <inputDir>");
            System.out.println(" <inputDir> is a directory with aggregated ngram files from Hadoop, e.g. produced by CommonCrawlNGramJob");
            System.exit(1);
        }
        File inputDir = new File(args[0]);
        // listFiles() returns null (not an empty array) if the path is not a directory or cannot
        // be read; check before any index directory gets created.
        File[] inputFiles = inputDir.listFiles();
        if (inputFiles == null) {
            throw new IOException("Cannot list files in input directory: " + inputDir);
        }
        File outputDir = new File(inputDir, "index");
        System.out.println("Indexing to " + outputDir);
        try (AggregatedNgramToLucene prg = new AggregatedNgramToLucene(outputDir)) {
            for (File file : inputFiles) {
                if (!file.isFile()) {
                    continue;
                }
                prg.indexInputFile(file);
            }
            prg.addTotalTokenCountDoc(prg.totalTokenCount, prg.indexes.get(1).indexWriter);
        }
    }

    /** A Lucene directory together with the writer that is open on it. */
    static class LuceneIndex {
        private final Directory directory;
        private final IndexWriter indexWriter;

        /**
         * Opens a file system directory at {@code dir} and an index writer on it, configured
         * with a {@code StandardAnalyzer}.
         *
         * @param dir location of the index
         * @throws IOException if the directory or the writer cannot be opened
         */
        LuceneIndex(File dir) throws IOException {
            IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
            Directory openedDirectory = FSDirectory.open(dir.toPath());
            IndexWriter writer;
            try {
                writer = new IndexWriter(openedDirectory, config);
            } catch (IOException | RuntimeException e) {
                // The writer could not be created, so nobody else will close the directory.
                try {
                    openedDirectory.close();
                } catch (IOException closeFailure) {
                    e.addSuppressed(closeFailure);
                }
                throw e;
            }
            this.directory = openedDirectory;
            this.indexWriter = writer;
        }

        /**
         * Closes the writer and then the directory; the directory is closed even if closing the
         * writer fails.
         *
         * @throws IOException if closing the writer or the directory fails
         */
        void close() throws IOException {
            // Resources are closed in reverse declaration order: writer first, then directory.
            try (Directory dir = this.directory; IndexWriter writer = this.indexWriter) {
                // nothing to do, only closing
            }
        }
    }
}

