/*
 * Decompiled with CFR 0.152.
 */
package ai.vespa.vespasignificance.generate;

import ai.vespa.vespasignificance.generate.FormatStrategy;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.yahoo.language.Language;
import com.yahoo.language.process.LinguisticsParameters;
import com.yahoo.language.process.StemMode;
import com.yahoo.language.process.Token;
import com.yahoo.language.process.TokenScript;
import com.yahoo.language.process.TokenType;
import com.yahoo.language.process.Tokenizer;
import java.io.BufferedReader;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.SortedMap;
import java.util.TreeMap;

/**
 * {@link FormatStrategy} that derives per-term document frequencies from a JSONL file.
 *
 * <p>The input is read line by line. Every non-blank line is parsed as one JSON value and
 * counted as one document. If that value has a {@code "fields"} member containing the
 * configured field, the field's content is tokenized and every distinct alphabetic,
 * Latin-script token contributes 1 to that term's document frequency.
 */
public final class JsonlDocumentFormatStrategy implements FormatStrategy {
    /** A progress line is printed to standard output each time this many documents have been read. */
    private static final long PROGRESS_INTERVAL = 50000L;

    private final Path input;
    private final String field;
    private final Tokenizer tokenizer;
    private final Language tokenizationLanguage;
    private final List<Language> languageKeyParts;
    private final ObjectMapper mapper = new ObjectMapper();

    /**
     * Creates a strategy for a single JSONL input file.
     *
     * @param input                path of the JSONL file to read
     * @param tokenizer            tokenizer applied to the content of {@code field}
     * @param tokenizationLanguage language handed to the tokenizer
     * @param languageKeyParts     languages whose codes make up {@link #languageKey()}
     * @param field                name of the member under {@code "fields"} to tokenize
     */
    public JsonlDocumentFormatStrategy(Path input, Tokenizer tokenizer, Language tokenizationLanguage, List<Language> languageKeyParts, String field) {
        this.input = input;
        this.field = field;
        this.tokenizer = tokenizer;
        this.tokenizationLanguage = tokenizationLanguage;
        // Snapshot the list so later changes by the caller cannot alter languageKey().
        this.languageKeyParts = List.copyOf(languageKeyParts);
    }

    /**
     * Reads the input file and accumulates document frequencies.
     *
     * <p>Blank lines are skipped and are not counted as documents. A non-blank line is
     * counted as a document even when it has no {@code "fields"} member or the configured
     * field is absent or JSON {@code null}; such a line simply contributes no terms.
     *
     * @return the term-to-document-frequency map (sorted, unmodifiable) together with the
     *         number of documents read
     * @throws IOException if the file cannot be read or a non-blank line is not valid JSON
     */
    @Override
    public FormatStrategy.Result build() throws IOException {
        SortedMap<String, Long> df = new TreeMap<>();
        // The parameters do not depend on the document, so build them once per run.
        // NOTE(review): the two booleans are presumably removeAccents=false, lowercase=true
        // -- confirm against LinguisticsParameters.
        LinguisticsParameters params = new LinguisticsParameters(this.tokenizationLanguage, StemMode.NONE, false, true);
        long docs = 0L;
        try (BufferedReader br = Files.newBufferedReader(this.input)) {
            String line;
            while ((line = br.readLine()) != null) {
                if (line.isBlank()) {
                    // A blank line carries no document; counting it would inflate the corpus size.
                    continue;
                }
                JsonNode root = this.mapper.readTree(line);
                JsonNode fields = root.get("fields");
                JsonNode value = fields == null ? null : fields.get(this.field);
                if (value != null && !value.isNull()) {
                    // Non-textual values (numbers, arrays, objects) are tokenized via their JSON text.
                    String text = value.isTextual() ? value.asText() : value.toString();
                    this.tokenizeAndAccumulate(df, text, params);
                }
                ++docs;
                if (docs % PROGRESS_INTERVAL == 0L) {
                    System.out.println("Documents processed: " + docs + ", unique terms: " + df.size());
                }
            }
        }
        // df never escapes this method, so an unmodifiable view is enough; no copy is needed.
        return new FormatStrategy.Result(Collections.unmodifiableSortedMap(df), docs);
    }

    /**
     * Tokenizes one document's text and adds 1 to the frequency of every distinct
     * alphabetic, Latin-script token found in it.
     */
    private void tokenizeAndAccumulate(SortedMap<String, Long> df, String text, LinguisticsParameters params) {
        Iterable<Token> tokens = this.tokenizer.tokenize(text, params);
        // Collect into a set first so a term repeated within one document is counted once.
        HashSet<String> unique = new HashSet<>();
        for (Token t : tokens) {
            if (t.getType() != TokenType.ALPHABETIC || t.getScript() != TokenScript.LATIN) {
                continue;
            }
            unique.add(t.getTokenString());
        }
        for (String term : unique) {
            df.merge(term, 1L, Long::sum);
        }
    }

    /**
     * Returns the language codes of the configured key languages, joined with commas in
     * the order they were supplied.
     */
    @Override
    public String languageKey() {
        return String.join(",", this.languageKeyParts.stream().map(Language::languageCode).toList());
    }
}

