/*
 * Decompiled with CFR 0.152.
 */
package com.yahoo.vespasignificance;

import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.ObjectWriter;
import com.yahoo.document.DataType;
import com.yahoo.document.Document;
import com.yahoo.document.DocumentPut;
import com.yahoo.document.DocumentType;
import com.yahoo.document.DocumentTypeManager;
import com.yahoo.document.Field;
import com.yahoo.document.datatypes.FieldValue;
import com.yahoo.document.json.DocumentOperationType;
import com.yahoo.document.json.JsonReader;
import com.yahoo.document.json.ParsedDocumentOperation;
import com.yahoo.language.Language;
import com.yahoo.language.opennlp.OpenNlpLinguistics;
import com.yahoo.language.process.StemMode;
import com.yahoo.language.process.Token;
import com.yahoo.language.process.TokenScript;
import com.yahoo.language.process.TokenType;
import com.yahoo.language.process.Tokenizer;
import com.yahoo.language.significance.impl.DocumentFrequencyFile;
import com.yahoo.language.significance.impl.SignificanceModelFile;
import com.yahoo.text.Utf8;
import com.yahoo.vespasignificance.ClientParameters;
import io.airlift.compress.zstd.ZstdInputStream;
import io.airlift.compress.zstd.ZstdOutputStream;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.OpenOption;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;

/**
 * Builds a significance model file from a file of JSON documents, one document per line.
 *
 * <p>For every document the configured field is tokenized, and for each distinct
 * alphabetic, Latin-script token the number of documents containing it is counted.
 * The resulting document frequencies are stored under a key made of the configured
 * language codes, either in a new model file or added to an already existing one.
 *
 * <p>Instances accumulate counts in mutable state and are not designed for concurrent use.
 */
public class SignificanceModelGenerator {
    private final ClientParameters clientParameters;
    private final Tokenizer tokenizer;
    /** Token string to number of documents containing it; sorted by token. */
    private final TreeMap<String, Long> documentFrequency = new TreeMap<>();
    private final List<Language> languages;
    /** The first configured language; used for tokenization and for generated document ids. */
    private final Language languageTag;
    private final ObjectMapper objectMapper;
    private static final JsonFactory parserFactory = new JsonFactory();
    final DocumentTypeManager types = new DocumentTypeManager();
    final DocumentType docType;
    private final boolean useZstCompression;
    private static final String VERSION = "1.0";
    private static final String ID = "1";
    private static final String SIGNIFICANCE_DESCRIPTION = "Significance model for input file";
    private static final String DOC_FREQ_DESCRIPTION = "Document frequency for language";

    /**
     * Creates a generator from command line parameters.
     *
     * @param clientParameters input/output file names, document type, field name,
     *                         comma-separated language tags and the compression flag
     * @throws IllegalArgumentException if the output file extension does not agree with
     *                                  the zst compression flag
     */
    public SignificanceModelGenerator(ClientParameters clientParameters) {
        this.clientParameters = clientParameters;
        boolean hasZstExtension = clientParameters.outputFile.endsWith(".zst");
        if (clientParameters.zstCompression && !hasZstExtension) {
            throw new IllegalArgumentException("Output file must have .zst extension when using zst compression");
        }
        if (!clientParameters.zstCompression && hasZstExtension) {
            throw new IllegalArgumentException("Output file must not have .zst extension when not using zst compression");
        }
        this.languages = Arrays.stream(clientParameters.language.split(","))
                .map(Language::fromLanguageTag)
                .collect(Collectors.toList());
        this.languageTag = this.languages.get(0);
        this.tokenizer = new OpenNlpLinguistics().getTokenizer();
        this.objectMapper = new ObjectMapper();
        this.docType = new DocumentType(clientParameters.docType);
        this.docType.addField(new Field(clientParameters.field, DataType.STRING));
        this.useZstCompression = clientParameters.zstCompression;
        this.types.registerDocumentType(this.docType);
    }

    /**
     * Reads the input file, counts document frequencies and writes the model to the output file.
     *
     * <p>If the output file already exists it is read as an existing model and the new
     * frequencies are added to it under the languages key; otherwise a new model is created.
     *
     * @throws IOException              if the input file or an existing model file cannot be read
     * @throws IllegalArgumentException if a document has no value for the configured field
     * @throws IllegalStateException    if the model cannot be written to the output file
     */
    public void generate() throws IOException {
        long pageCount = countDocumentFrequencies();
        String languagesKey = this.languages.stream()
                .map(Language::languageCode)
                .collect(Collectors.joining(","));
        DocumentFrequencyFile frequencies =
                new DocumentFrequencyFile(DOC_FREQ_DESCRIPTION, pageCount, getFinalDocumentFrequency());
        writeModel(loadOrCreateModel(languagesKey, frequencies));
    }

    /**
     * Parses every non-blank line of the input file as a document and feeds the
     * configured field through {@link #handleTokenization(String)}.
     *
     * @return the number of documents processed
     */
    private long countDocumentFrequencies() throws IOException {
        Path inputPath = Paths.get("").toAbsolutePath().resolve(this.clientParameters.inputFile);
        long documentCount = 0L;
        long lineNumber = 0L;
        // Decode explicitly as UTF-8: each line is re-encoded with Utf8.toBytes below, so
        // decoding with the platform default charset would corrupt non-ASCII text.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(Files.newInputStream(inputPath), StandardCharsets.UTF_8))) {
            String line;
            // readLine() returning null is the end-of-stream signal; ready() only tells
            // whether a read would block and is not a reliable EOF test.
            while ((line = reader.readLine()) != null) {
                ++lineNumber;
                if (line.isBlank()) {
                    continue; // blank lines carry no document and are not counted
                }
                long documentNumber = documentCount + 1L;
                JsonReader jsonReader = new JsonReader(
                        this.types, new ByteArrayInputStream(Utf8.toBytes(line)), parserFactory);
                String wikimediaId = "id:wikimedia:" + this.languageTag.languageCode() + "::" + documentNumber;
                ParsedDocumentOperation operation =
                        jsonReader.readSingleDocumentStreaming(DocumentOperationType.PUT, wikimediaId);
                Document document = ((DocumentPut) operation.operation()).getDocument();
                FieldValue fieldValue = document.getFieldValue(this.clientParameters.field);
                if (fieldValue == null) {
                    throw new IllegalArgumentException("Document on line " + lineNumber + " of "
                            + this.clientParameters.inputFile + " has no value for field '"
                            + this.clientParameters.field + "'");
                }
                handleTokenization(fieldValue.toString());
                documentCount = documentNumber;
            }
        }
        return documentCount;
    }

    /**
     * Returns the model to write: the existing model at the output path with the new
     * frequencies added, or a fresh model holding only the new frequencies.
     */
    private SignificanceModelFile loadOrCreateModel(String languagesKey, DocumentFrequencyFile frequencies)
            throws IOException {
        File outputFile = Paths.get(this.clientParameters.outputFile).toFile();
        if (outputFile.exists()) {
            boolean compressed = outputFile.toString().endsWith(".zst");
            // Closed before returning, so the same path can be reopened for writing afterwards.
            try (InputStream fileIn = new FileInputStream(outputFile);
                 InputStream in = compressed ? new ZstdInputStream(fileIn) : fileIn) {
                SignificanceModelFile existing = this.objectMapper.readValue(in, SignificanceModelFile.class);
                existing.addLanguage(languagesKey, frequencies);
                return existing;
            }
        }
        // Declared as HashMap (not Map) to match the argument type the original code passed.
        HashMap<String, DocumentFrequencyFile> frequenciesByLanguage = new HashMap<>();
        frequenciesByLanguage.put(languagesKey, frequencies);
        // NOTE(review): the description is concatenated with the file name without a
        // separator; kept as-is so generated files stay identical - confirm if intended.
        return new SignificanceModelFile(
                VERSION, ID, SIGNIFICANCE_DESCRIPTION + this.clientParameters.inputFile, frequenciesByLanguage);
    }

    /**
     * Serializes the model as pretty-printed JSON to the output file, zst-compressed when configured.
     *
     * @throws IllegalStateException if writing fails
     */
    private void writeModel(SignificanceModelFile modelFile) {
        ObjectWriter writer = this.objectMapper.writerWithDefaultPrettyPrinter();
        try (OutputStream fileOut = new FileOutputStream(this.clientParameters.outputFile);
             OutputStream out = this.useZstCompression ? new ZstdOutputStream(fileOut) : fileOut) {
            writer.writeValue(out, modelFile);
        }
        catch (IOException e) {
            throw new IllegalStateException("Failed to write model to output file", e);
        }
    }

    /**
     * Tokenizes one document's field text and increments the document count of every
     * distinct alphabetic, Latin-script token found in it.
     */
    private void handleTokenization(String field) {
        Iterable<Token> tokens = this.tokenizer.tokenize(field, this.languageTag, StemMode.ALL, false);
        // Collected into a set so that a token is counted once per document.
        Set<String> uniqueWords = StreamSupport.stream(tokens.spliterator(), false)
                .filter(token -> token.getType() == TokenType.ALPHABETIC)
                .filter(token -> token.getScript() == TokenScript.LATIN)
                .map(Token::getTokenString)
                .collect(Collectors.toSet());
        for (String word : uniqueWords) {
            this.documentFrequency.merge(word, 1L, Long::sum);
        }
    }

    /**
     * Returns the document frequencies to store in the model.
     *
     * @return a new sorted map holding only the tokens seen in more than one document
     */
    public Map<String, Long> getFinalDocumentFrequency() {
        return this.documentFrequency.entrySet().stream()
                .filter(entry -> entry.getValue() > 1L)
                .collect(Collectors.toMap(
                        Map.Entry::getKey, Map.Entry::getValue, (first, second) -> first, TreeMap::new));
    }
}

