/*
 * Decompiled with CFR 0.152.
 */
package org.languagetool.dev.bigdata;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import org.apache.commons.compress.compressors.CompressorException;
import org.apache.commons.compress.compressors.CompressorInputStream;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.languagetool.Language;
import org.languagetool.Languages;
import org.languagetool.rules.en.GoogleStyleWordTokenizer;
import org.languagetool.tokenizers.SentenceTokenizer;
import org.languagetool.tokenizers.Tokenizer;

/**
 * Counts unigrams, bigrams and trigrams in a compressed text file and writes the
 * counts as {@code ngram<TAB>count} lines to {@code unigrams.csv}, {@code bigrams.csv}
 * and {@code trigrams.csv} in the output directory.
 *
 * <p>Counts are cached in memory. Whenever a cache grows beyond {@link #CACHE_LIMIT}
 * entries it is written out and cleared, so the same ngram can appear on several
 * lines of an output file, each line carrying only a partial count.
 *
 * <p>Input is decoded as UTF-8 and output is written as UTF-8, independent of the
 * platform default charset.
 */
class CommonCrawlToNgram3
implements AutoCloseable {
    /** Tokens longer than this many characters are not counted in any ngram. */
    private static final int MAX_TOKEN_LENGTH = 20;
    /** Input lines longer than this many characters are skipped. */
    private static final int MAX_SENTENCE_LENGTH = 50000;
    /** Number of distinct ngrams a cache may hold before it is written to disk. */
    private static final int CACHE_LIMIT = 1000000;
    /** A progress message is printed once per this many indexed lines. */
    private static final int PROGRESS_INTERVAL = 50000;
    private static final String SENTENCE_START = "_START_";
    private static final String SENTENCE_END = "_END_";

    private final File input;
    private final SentenceTokenizer sentenceTokenizer;
    private final Tokenizer wordTokenizer;
    private final Map<String, Long> unigramToCount = new HashMap<>();
    private final Map<String, Long> bigramToCount = new HashMap<>();
    private final Map<String, Long> trigramToCount = new HashMap<>();
    private final Map<Integer, Writer> ngramSizeToWriter = new HashMap<>();
    private long charCount = 0L;
    private long lineCount = 0L;

    /**
     * Opens the three output files, truncating any existing content.
     *
     * @param language  supplies the sentence tokenizer
     * @param input     compressed input file
     * @param outputDir directory the three CSV files are created in
     * @throws IOException if an output file cannot be opened; writers opened before
     *                     the failure are closed again
     */
    CommonCrawlToNgram3(Language language, File input, File outputDir) throws IOException {
        this.input = input;
        this.sentenceTokenizer = language.getSentenceTokenizer();
        this.wordTokenizer = new GoogleStyleWordTokenizer();
        try {
            this.openWriter(1, outputDir, "unigrams.csv");
            this.openWriter(2, outputDir, "bigrams.csv");
            this.openWriter(3, outputDir, "trigrams.csv");
        } catch (IOException | RuntimeException e) {
            // Do not leak the writers that were opened before the failing one.
            try {
                this.closeWriters();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }

    /**
     * Closes all output writers. Every writer is closed even if closing an earlier
     * one fails.
     *
     * @throws IOException the first close failure, with later ones attached as suppressed
     */
    @Override
    public void close() throws IOException {
        this.closeWriters();
    }

    /** Opens a UTF-8 writer for {@code fileName} and registers it under {@code ngramSize}. */
    private void openWriter(int ngramSize, File outputDir, String fileName) throws IOException {
        Writer writer = Files.newBufferedWriter(new File(outputDir, fileName).toPath(), StandardCharsets.UTF_8);
        this.ngramSizeToWriter.put(ngramSize, writer);
    }

    /** Closes every registered writer, collecting failures instead of stopping at the first. */
    private void closeWriters() throws IOException {
        IOException failure = null;
        for (Writer writer : this.ngramSizeToWriter.values()) {
            try {
                writer.close();
            } catch (IOException e) {
                if (failure == null) {
                    failure = e;
                } else {
                    failure.addSuppressed(e);
                }
            }
        }
        if (failure != null) {
            throw failure;
        }
    }

    /**
     * Reads the decompressed input line by line, indexes every line and finally
     * writes whatever is left in the three caches.
     *
     * @throws IOException         if reading the input or writing the output fails
     * @throws CompressorException if the compression format cannot be handled
     */
    private void indexInputFile() throws IOException, CompressorException {
        // Decoding through a reader (instead of converting each raw byte chunk on its
        // own) keeps lines and multi-byte characters intact when they cross a read
        // boundary. The BufferedInputStream stays because it supports mark/reset.
        try (InputStream fileIn = new BufferedInputStream(new FileInputStream(this.input));
             CompressorInputStream compressedIn = new CompressorStreamFactory().createCompressorInputStream(fileIn);
             BufferedReader reader = new BufferedReader(new InputStreamReader(compressedIn, StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                this.indexLine(line);
            }
        }
        this.writeToDisk(1, this.unigramToCount);
        this.writeToDisk(2, this.bigramToCount);
        this.writeToDisk(3, this.trigramToCount);
    }

    /**
     * Splits one input line into sentences and indexes each of them. Lines longer
     * than {@link #MAX_SENTENCE_LENGTH} characters are reported and skipped.
     */
    private void indexLine(String line) throws IOException {
        if (line.length() > MAX_SENTENCE_LENGTH) {
            System.out.println("Ignoring long line: " + line.length() + " bytes");
            return;
        }
        if (this.lineCount++ % PROGRESS_INTERVAL == 0L) {
            float mb = (float)this.charCount / 1000.0f / 1000.0f;
            System.out.printf(Locale.ENGLISH, "Indexing line %d (%.2fMB)\n", this.lineCount, mb);
        }
        this.charCount += line.length();
        List<String> sentences = this.sentenceTokenizer.tokenize(line);
        for (String sentence : sentences) {
            this.indexSentence(sentence);
        }
    }

    /**
     * Counts the unigrams, bigrams and trigrams of one sentence, framed by the
     * {@code _START_} and {@code _END_} markers. Blank tokens are skipped, and an
     * ngram is only counted if none of its tokens exceeds {@link #MAX_TOKEN_LENGTH}.
     */
    private void indexSentence(String sentence) throws IOException {
        // NOTE(review): relies on the tokenizer returning a modifiable list.
        List<String> tokens = this.wordTokenizer.tokenize(sentence);
        tokens.add(0, SENTENCE_START);
        tokens.add(SENTENCE_END);
        String prevPrev = null;
        String prev = null;
        for (String token : tokens) {
            if (token.trim().isEmpty()) {
                continue;
            }
            boolean tokenFits = token.length() <= MAX_TOKEN_LENGTH;
            if (tokenFits) {
                increment(this.unigramToCount, token);
            }
            boolean prevFits = prev != null && prev.length() <= MAX_TOKEN_LENGTH;
            if (tokenFits && prevFits) {
                increment(this.bigramToCount, prev + " " + token);
            }
            // prevPrev is only ever set from a non-null prev, so it implies prev != null.
            if (prevPrev != null) {
                if (tokenFits && prevFits && prevPrev.length() <= MAX_TOKEN_LENGTH) {
                    increment(this.trigramToCount, prevPrev + " " + prev + " " + token);
                }
                this.flushFullCaches();
            }
            prevPrev = prev;
            prev = token;
        }
    }

    /** Adds one occurrence of {@code ngram} to {@code counts}. */
    private static void increment(Map<String, Long> counts, String ngram) {
        counts.merge(ngram, 1L, Long::sum);
    }

    /** Writes out and clears every cache that holds more than {@link #CACHE_LIMIT} entries. */
    private void flushFullCaches() throws IOException {
        if (this.unigramToCount.size() > CACHE_LIMIT) {
            this.writeToDisk(1, this.unigramToCount);
        }
        if (this.bigramToCount.size() > CACHE_LIMIT) {
            this.writeToDisk(2, this.bigramToCount);
        }
        if (this.trigramToCount.size() > CACHE_LIMIT) {
            this.writeToDisk(3, this.trigramToCount);
        }
    }

    /**
     * Writes all cached counts as {@code ngram<TAB>count} lines to the output file
     * for the given ngram size, flushes the writer and clears the cache.
     *
     * @param ngramSize    1, 2 or 3; selects the output file
     * @param ngramToCount cache to write; empty when this method returns
     */
    private void writeToDisk(int ngramSize, Map<String, Long> ngramToCount) throws IOException {
        System.out.println("Writing " + ngramToCount.size() + " cached ngrams to disk (ngramSize=" + ngramSize + ")...");
        Writer writer = this.ngramSizeToWriter.get(ngramSize);
        for (Map.Entry<String, Long> entry : ngramToCount.entrySet()) {
            writer.write(entry.getKey() + "\t" + entry.getValue() + "\n");
        }
        writer.flush();
        ngramToCount.clear();
    }

    /**
     * Command line entry point; expects a language code, the compressed input file
     * and the output directory. Prints a usage message and exits with status 1 if
     * the number of arguments is wrong.
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 3) {
            System.out.println("Usage: " + CommonCrawlToNgram3.class + " <langCode> <input.xz/bz2> <outputDir>");
            System.exit(1);
        }
        Language language = Languages.getLanguageForShortCode(args[0]);
        File input = new File(args[1]);
        File outputDir = new File(args[2]);
        try (CommonCrawlToNgram3 prg = new CommonCrawlToNgram3(language, input, outputDir)) {
            prg.indexInputFile();
        }
    }
}

