/*
 * Decompiled with CFR 0.152.
 */
package com.yahoo.vespa.indexinglanguage.linguistics;

import com.yahoo.document.annotation.Annotation;
import com.yahoo.document.annotation.AnnotationTypes;
import com.yahoo.document.annotation.Span;
import com.yahoo.document.annotation.SpanList;
import com.yahoo.document.annotation.SpanTree;
import com.yahoo.document.datatypes.FieldValue;
import com.yahoo.document.datatypes.StringFieldValue;
import com.yahoo.language.Linguistics;
import com.yahoo.language.LinguisticsCase;
import com.yahoo.language.process.StemMode;
import com.yahoo.language.process.Token;
import com.yahoo.language.process.Tokenizer;
import com.yahoo.text.Text;
import com.yahoo.vespa.indexinglanguage.linguistics.AnnotatorConfig;
import java.util.HashMap;
import java.util.Map;

/**
 * Tokenizes string field values and attaches the resulting terms as {@code TERM} annotations
 * in a span tree named {@code "linguistics"}.
 *
 * <p>The tokenizer is obtained from the supplied {@link Linguistics} instance; language, stem mode,
 * accent removal, the maximum length to tokenize and the maximum number of occurrences recorded
 * per term are all read from the supplied {@link AnnotatorConfig}.
 */
public class LinguisticsAnnotator {
    /** Name of the span tree this annotator looks up and produces. */
    private static final String SPAN_TREE_NAME = "linguistics";

    private final Linguistics factory;
    private final AnnotatorConfig config;

    /**
     * Creates an annotator.
     *
     * @param factory the linguistics implementation supplying the tokenizer
     * @param config  the settings controlling tokenization and annotation
     */
    public LinguisticsAnnotator(Linguistics factory, AnnotatorConfig config) {
        this.factory = factory;
        this.config = config;
    }

    /**
     * Annotates the given string with term annotations, unless it already carries a
     * {@code "linguistics"} span tree.
     *
     * <p>Only the leading part of the string (up to the configured maximum tokenize length) is
     * handed to the tokenizer, while token offsets are validated against the complete string.
     *
     * @param text the field value to annotate; receives the new span tree on success
     * @return {@code true} if the value already had a {@code "linguistics"} span tree or one was
     *         added by this call, {@code false} if tokenization produced no annotations (in which
     *         case no span tree is set)
     * @throws IllegalArgumentException if a token reports an offset or length that falls outside
     *                                  the string
     */
    public boolean annotate(StringFieldValue text) {
        if (text.getSpanTree(SPAN_TREE_NAME) != null) {
            return true; // already annotated
        }
        Tokenizer tokenizer = factory.getTokenizer();
        String fullText = text.getString();
        int maxLength = config.getMaxTokenizeLength();
        String input = fullText.length() <= maxLength
                ? fullText
                : Text.substringByCodepoints(fullText, 0, maxLength);
        // Must be Iterable<Token>: a raw Iterable yields Object elements and cannot be
        // iterated as Token in the enhanced for loop below.
        Iterable<Token> tokens = tokenizer.tokenize(input, config.getLanguage(), config.getStemMode(),
                                                    config.getRemoveAccents());
        TermOccurrences termOccurrences = new TermOccurrences(config.getMaxTermOccurrences());
        SpanTree tree = new SpanTree(SPAN_TREE_NAME);
        for (Token token : tokens) {
            addAnnotationSpan(fullText, tree.spanList(), token, config.getStemMode(), termOccurrences);
        }
        if (tree.numAnnotations() == 0) {
            return false;
        }
        text.setSpanTree(tree);
        return true;
    }

    /**
     * Creates a {@code TERM} annotation for a term.
     *
     * <p>The term is lowercased; if the lowercased form equals {@code origTerm} the annotation
     * carries no value, otherwise it carries the lowercased form as its value.
     *
     * @param termToLowerCase the term to lowercase and annotate with
     * @param origTerm        the original text the annotation will be attached to
     * @return a new {@code TERM} annotation, with or without an explicit value
     */
    public static Annotation lowerCaseTermAnnotation(String termToLowerCase, String origTerm) {
        String annotationValue = LinguisticsCase.toLowerCase(termToLowerCase);
        if (annotationValue.equals(origTerm)) {
            return new Annotation(AnnotationTypes.TERM);
        }
        return new Annotation(AnnotationTypes.TERM, new StringFieldValue(annotationValue));
    }

    /** Annotates {@code here} with {@code term} unless that term has reached its occurrence limit. */
    private static void addAnnotation(Span here, String term, String orig, TermOccurrences termOccurrences) {
        if (termOccurrences.termCountBelowLimit(term)) {
            here.annotate(lowerCaseTermAnnotation(term, orig));
        }
    }

    /**
     * Adds the span(s) and annotation(s) for one token to {@code parent}.
     *
     * <p>A non-special token that has components is replaced by its components (recursively), and a
     * non-special token that is not indexable is skipped. Special tokens bypass both checks.
     * With {@link StemMode#ALL} a single span is annotated with the original text, the token string
     * (if it differs when lowercased) and every stem that differs from both; with any other mode
     * a span is annotated with the token string only.
     *
     * @throws IllegalArgumentException if the token does not fit inside {@code input}
     */
    private static void addAnnotationSpan(String input, SpanList parent, Token token, StemMode mode,
                                          TermOccurrences termOccurrences) {
        if (!token.isSpecialToken()) {
            if (token.getNumComponents() > 0) {
                for (int i = 0; i < token.getNumComponents(); ++i) {
                    addAnnotationSpan(input, parent, token.getComponent(i), mode, termOccurrences);
                }
                return;
            }
            if (!token.isIndexable()) {
                return;
            }
        }
        if (token.getOffset() >= input.length()) {
            throw new IllegalArgumentException(token + " has offset " + token.getOffset() + ", which is outside the bounds of the input string '" + input + "'");
        }
        if (token.getOffset() + token.getOrig().length() > input.length()) {
            throw new IllegalArgumentException(token + " has offset " + token.getOffset() + ", which makes it overflow the bounds of the input string; " + input);
        }
        if (mode == StemMode.ALL) {
            Span where = parent.span((int) token.getOffset(), token.getOrig().length());
            String lowercasedOrig = LinguisticsCase.toLowerCase(token.getOrig());
            addAnnotation(where, token.getOrig(), token.getOrig(), termOccurrences);

            // A missing token string is treated as identical to the original text,
            // so it is never annotated separately.
            String lowercasedTerm = lowercasedOrig;
            String term = token.getTokenString();
            if (term != null) {
                lowercasedTerm = LinguisticsCase.toLowerCase(term);
            }
            if (!lowercasedOrig.equals(lowercasedTerm)) {
                addAnnotation(where, term, token.getOrig(), termOccurrences);
            }
            for (int i = 0; i < token.getNumStems(); ++i) {
                String stem = token.getStem(i);
                String lowercasedStem = LinguisticsCase.toLowerCase(stem);
                // Skip stems that merely repeat the original text or the token string.
                if (lowercasedOrig.equals(lowercasedStem) || lowercasedTerm.equals(lowercasedStem)) {
                    continue;
                }
                addAnnotation(where, stem, token.getOrig(), termOccurrences);
            }
        } else {
            String term = token.getTokenString();
            if (term == null || term.trim().isEmpty()) {
                return;
            }
            // The span is only created when the annotation will actually be added.
            if (termOccurrences.termCountBelowLimit(term)) {
                parent.span((int) token.getOffset(), token.getOrig().length())
                      .annotate(lowerCaseTermAnnotation(term, token.getOrig()));
            }
        }
    }

    /** Counts how many times each (lowercased) term has been accepted, up to a fixed maximum. */
    private static class TermOccurrences {
        final Map<String, Integer> termOccurrences = new HashMap<>();
        final int maxOccurrences;

        public TermOccurrences(int maxOccurrences) {
            this.maxOccurrences = maxOccurrences;
        }

        /**
         * Registers one more occurrence of the term if it is still below the limit.
         *
         * @return {@code true} if the occurrence was counted, {@code false} if the term had
         *         already reached the maximum (the count is then left unchanged)
         */
        boolean termCountBelowLimit(String term) {
            String lowerCasedTerm = LinguisticsCase.toLowerCase(term);
            int occurrences = termOccurrences.getOrDefault(lowerCasedTerm, 0);
            if (occurrences >= maxOccurrences) {
                return false;
            }
            termOccurrences.put(lowerCasedTerm, occurrences + 1);
            return true;
        }
    }
}

