/*
 * Decompiled with CFR 0.152.
 */
package ai.djl.modality.nlp.bert;

import ai.djl.modality.nlp.NlpUtils;
import ai.djl.modality.nlp.SimpleVocabulary;
import ai.djl.modality.nlp.bert.WordpieceTokenizer;
import ai.djl.modality.nlp.preprocess.LambdaProcessor;
import ai.djl.modality.nlp.preprocess.LowerCaseConvertor;
import ai.djl.modality.nlp.preprocess.PunctuationSeparator;
import ai.djl.modality.nlp.preprocess.SimpleTokenizer;
import ai.djl.modality.nlp.preprocess.TextCleaner;
import ai.djl.modality.nlp.preprocess.TextProcessor;
import ai.djl.modality.nlp.preprocess.UnicodeNormalizer;
import java.io.BufferedReader;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Tokenizer that first runs the input through a fixed chain of {@link TextProcessor}s and then
 * hands the resulting tokens to a {@link WordpieceTokenizer}.
 *
 * <p>The vocabulary used by the wordpiece step is read from a text file with one token per line.
 */
public class BertFullTokenizer extends SimpleTokenizer {
    private static final Logger logger = LoggerFactory.getLogger(BertFullTokenizer.class);

    /** Unknown-token string shared by the vocabulary builder and the wordpiece tokenizer. */
    private static final String UNKNOWN_TOKEN = "[UNK]";

    private final SimpleVocabulary vocabulary;
    private final List<TextProcessor> basicBertPreprocessors;
    private final WordpieceTokenizer wordpieceTokenizer;

    /**
     * Creates a tokenizer whose vocabulary is loaded from the given file.
     *
     * <p>If the file cannot be read, the error is logged and the tokenizer is still constructed,
     * using a vocabulary built from whatever tokens were read before the failure (possibly none).
     *
     * @param filepath path of the vocabulary file, one token per line
     * @param lowerCase whether a {@link LowerCaseConvertor} is included in the preprocessing chain
     */
    public BertFullTokenizer(String filepath, boolean lowerCase) {
        this.vocabulary = parse(filepath);
        this.basicBertPreprocessors = getPreprocessors(lowerCase);
        // NOTE(review): 200 is presumably the wordpiece tokenizer's per-word length limit;
        // confirm against the WordpieceTokenizer constructor.
        this.wordpieceTokenizer = new WordpieceTokenizer(this.vocabulary, UNKNOWN_TOKEN, 200);
    }

    /**
     * Returns the vocabulary that was built from the vocabulary file.
     *
     * @return the vocabulary used by this tokenizer
     */
    public SimpleVocabulary getVocabulary() {
        return this.vocabulary;
    }

    /**
     * Tokenizes the input by applying every basic preprocessor in order, followed by the
     * wordpiece tokenizer.
     *
     * @param input the text to tokenize
     * @return the tokens produced by the wordpiece tokenizer
     */
    @Override
    public List<String> tokenize(String input) {
        // Start from a mutable single-element list; each processor returns the list for the next.
        List<String> tokens = new ArrayList<>(Collections.singletonList(input));
        for (TextProcessor processor : this.basicBertPreprocessors) {
            tokens = processor.preprocess(tokens);
        }
        return this.wordpieceTokenizer.preprocess(tokens);
    }

    /**
     * Builds the ordered chain of preprocessors applied before wordpiece tokenization.
     *
     * <p>The chain is, in order: a {@link TextCleaner} matching NUL, U+FFFD and characters for
     * which {@code NlpUtils.isControl} is true; a {@link TextCleaner} matching characters for
     * which {@code NlpUtils.isWhiteSpace} is true; a trim; a {@link SimpleTokenizer}; optionally a
     * {@link LowerCaseConvertor}; an NFD {@link UnicodeNormalizer}; a {@link TextCleaner} matching
     * non-spacing marks; a {@link PunctuationSeparator}; and a final trim.
     *
     * @param lowerCase whether to include the {@link LowerCaseConvertor} step
     * @return a new mutable list holding the preprocessors in application order
     */
    public static List<TextProcessor> getPreprocessors(boolean lowerCase) {
        List<TextProcessor> processors = new ArrayList<>(10);
        // NOTE(review): the second TextCleaner argument is presumably the replacement character,
        // with NUL apparently meaning "drop the character"; confirm against TextCleaner.
        processors.add(
                new TextCleaner(
                        c -> c == '\u0000' || c == '\ufffd' || NlpUtils.isControl(c), '\u0000'));
        processors.add(new TextCleaner(NlpUtils::isWhiteSpace, ' '));
        processors.add(new LambdaProcessor(String::trim));
        processors.add(new SimpleTokenizer());
        if (lowerCase) {
            processors.add(new LowerCaseConvertor());
        }
        processors.add(new UnicodeNormalizer(Normalizer.Form.NFD));
        // After NFD decomposition, match the non-spacing marks (general category Mn).
        processors.add(
                new TextCleaner(
                        c -> Character.getType(c) == Character.NON_SPACING_MARK, '\u0000'));
        processors.add(new PunctuationSeparator());
        processors.add(new LambdaProcessor(String::trim));
        return processors;
    }

    /**
     * Reads the vocabulary file and builds the vocabulary from its non-blank, trimmed lines.
     *
     * <p>An {@link IOException} is logged rather than propagated; the vocabulary is then built
     * from the tokens collected up to that point.
     *
     * @param path path of the vocabulary file
     * @return the vocabulary built from the tokens that were read
     */
    private static SimpleVocabulary parse(String path) {
        List<String> tokens = new ArrayList<>();
        try (BufferedReader reader = Files.newBufferedReader(Paths.get(path))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String token = line.trim();
                if (!token.isEmpty()) {
                    tokens.add(token);
                }
            }
        } catch (IOException e) {
            // Keep the original log-and-continue policy, but record which file failed.
            logger.error("Failed read token file: {}", path, e);
        }
        return new SimpleVocabulary.VocabularyBuilder()
                .optMinFrequency(1)
                .add(tokens)
                .optUnknownToken(UNKNOWN_TOKEN)
                .build();
    }
}

