/*
 * Decompiled with CFR 0.152.
 */
package ai.djl.modality.nlp.bert;

import ai.djl.modality.nlp.NlpUtils;
import ai.djl.modality.nlp.Vocabulary;
import ai.djl.modality.nlp.bert.BertTokenizer;
import ai.djl.modality.nlp.bert.WordpieceTokenizer;
import ai.djl.modality.nlp.preprocess.LambdaProcessor;
import ai.djl.modality.nlp.preprocess.LowerCaseConvertor;
import ai.djl.modality.nlp.preprocess.PunctuationSeparator;
import ai.djl.modality.nlp.preprocess.SimpleTokenizer;
import ai.djl.modality.nlp.preprocess.TextCleaner;
import ai.djl.modality.nlp.preprocess.TextProcessor;
import ai.djl.modality.nlp.preprocess.UnicodeNormalizer;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * A {@link BertTokenizer} that runs an input string through a fixed chain of
 * {@link TextProcessor}s and then hands the resulting tokens to a {@link WordpieceTokenizer}.
 *
 * <p>The chain is built once, in the constructor, by {@link #getPreprocessors(boolean)}.
 */
public class BertFullTokenizer extends BertTokenizer {

    // Second and third constructor arguments handed to WordpieceTokenizer.
    // NOTE(review): presumably the placeholder for out-of-vocabulary pieces and a per-word
    // character limit -- confirm against WordpieceTokenizer.
    private static final String WORDPIECE_UNKNOWN_TOKEN = "[UNK]";
    private static final int WORDPIECE_MAX_CHARS = 200;

    private final Vocabulary vocabulary;
    private final List<TextProcessor> basicBertPreprocessors;
    private final WordpieceTokenizer wordpieceTokenizer;

    /**
     * Creates a tokenizer backed by the given vocabulary.
     *
     * @param vocabulary the vocabulary passed to the {@link WordpieceTokenizer} and returned by
     *     {@link #getVocabulary()}
     * @param lowerCase whether a {@link LowerCaseConvertor} is included in the preprocessing chain
     */
    public BertFullTokenizer(Vocabulary vocabulary, boolean lowerCase) {
        this.vocabulary = vocabulary;
        this.basicBertPreprocessors = getPreprocessors(lowerCase);
        this.wordpieceTokenizer =
                new WordpieceTokenizer(vocabulary, WORDPIECE_UNKNOWN_TOKEN, WORDPIECE_MAX_CHARS);
    }

    /**
     * Returns the vocabulary this tokenizer was constructed with.
     *
     * @return the vocabulary passed to the constructor
     */
    public Vocabulary getVocabulary() {
        return vocabulary;
    }

    /**
     * Tokenizes a string by applying every preprocessor in order and then the wordpiece
     * tokenizer.
     *
     * @param input the text to tokenize
     * @return the tokens produced by the wordpiece tokenizer
     */
    @Override
    public List<String> tokenize(String input) {
        // Start from a mutable single-element list; each processor maps a token list to a new one.
        List<String> tokens = new ArrayList<>(Collections.singletonList(input));
        for (TextProcessor processor : basicBertPreprocessors) {
            tokens = processor.preprocess(tokens);
        }
        return wordpieceTokenizer.preprocess(tokens);
    }

    /**
     * Joins tokens back into a single string.
     *
     * <p>Tokens are joined with single spaces, every occurrence of {@code " ##"} is then removed
     * (gluing a {@code ##}-prefixed token onto the token before it), and the result is trimmed.
     *
     * @param tokens the tokens to join
     * @return the joined string
     */
    @Override
    public String tokenToString(List<String> tokens) {
        return String.join(" ", tokens).replace(" ##", "").trim();
    }

    /**
     * Builds the ordered preprocessing chain applied before wordpiece tokenization.
     *
     * <p>A new mutable list is created on every call.
     *
     * @param lowerCase whether to include a {@link LowerCaseConvertor} in the chain
     * @return the processors, in the order they are meant to be applied
     */
    public static List<TextProcessor> getPreprocessors(boolean lowerCase) {
        List<TextProcessor> processors = new ArrayList<>(10);
        // Match NUL, U+FFFD (the Unicode replacement character) and anything NlpUtils deems a
        // control character. NOTE(review): a '\u0000' replacement presumably makes TextCleaner
        // drop the matched character -- confirm against TextCleaner.
        processors.add(
                new TextCleaner(
                        c -> c == '\u0000' || c == '\ufffd' || NlpUtils.isControl(c), '\u0000'));
        // Characters NlpUtils classifies as whitespace are cleaned with a plain space.
        processors.add(new TextCleaner(NlpUtils::isWhiteSpace, ' '));
        processors.add(new LambdaProcessor(String::trim));
        processors.add(new SimpleTokenizer());
        if (lowerCase) {
            processors.add(new LowerCaseConvertor());
        }
        // NFD decomposition followed by cleaning of non-spacing marks (category Mn); together
        // these presumably strip accents from letters.
        processors.add(new UnicodeNormalizer(Normalizer.Form.NFD));
        processors.add(
                new TextCleaner(c -> Character.getType(c) == Character.NON_SPACING_MARK, '\u0000'));
        processors.add(new PunctuationSeparator());
        processors.add(new LambdaProcessor(String::trim));
        return processors;
    }
}

