/*
 * Decompiled with CFR 0.152.
 */
package org.deeplearning4j.text.tokenization.tokenizer;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;
import org.deeplearning4j.text.tokenization.tokenizer.TokenPreProcess;
import org.deeplearning4j.text.tokenization.tokenizer.Tokenizer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A {@link Tokenizer} that splits text into basic tokens and then breaks each
 * basic token into word pieces by repeatedly taking the longest vocabulary
 * entry that is a prefix of the remaining text. Every piece after the first
 * one of a basic token is looked up with a {@code "##"} prefix.
 *
 * <p>The whole input is tokenized eagerly in the constructor; the resulting
 * pieces are then served through {@link #nextToken()} / {@link #getTokens()}.
 * The optional token pre-processor is applied lazily, when tokens are handed
 * out, not when they are produced.
 */
public class BertWordPieceTokenizer
implements Tokenizer {
    private static final Logger log = LoggerFactory.getLogger(BertWordPieceTokenizer.class);

    /**
     * Pattern used to cut the input into basic tokens: it matches runs of
     * whitespace (which are dropped) as well as the zero-width positions
     * directly after a punctuation character and directly before punctuation,
     * so punctuation characters end up as tokens of their own.
     */
    public static final Pattern splitPattern = Pattern.compile("\\p{javaWhitespace}+|((?<=\\p{Punct})+|(?=\\p{Punct}+))");

    /** Word pieces produced by the constructor, before token pre-processing. */
    private final List<String> tokens;
    /** Applied once to the complete input string before splitting; may be {@code null}. */
    private final TokenPreProcess preTokenizePreProcessor;
    /** Applied to each word piece when it is handed out; may be {@code null}. */
    private TokenPreProcess tokenPreProcess;
    /** Index of the next piece returned by {@link #nextToken()}. */
    private final AtomicInteger cursor = new AtomicInteger(0);

    /**
     * Tokenizes {@code tokens} against {@code vocab}.
     *
     * @param tokens                  the text to tokenize
     * @param vocab                   the word-piece vocabulary; it must have an explicit comparator
     *                                that does not order {@code "a"} before {@code "b"}
     * @param preTokenizePreProcessor optional pre-processor applied to the whole text before
     *                                splitting, or {@code null}
     * @param tokenPreProcess         optional pre-processor applied to every returned piece,
     *                                or {@code null}
     * @throws IllegalArgumentException if {@code vocab} has no comparator or its comparator
     *                                  orders {@code "a"} before {@code "b"}
     * @throws IllegalStateException    if some part of the text cannot be matched against the
     *                                  vocabulary
     */
    public BertWordPieceTokenizer(String tokens, NavigableMap<String, Integer> vocab, TokenPreProcess preTokenizePreProcessor, TokenPreProcess tokenPreProcess) {
        // The longest-prefix search below depends on the map being sorted in reverse order.
        if (vocab.comparator() == null || vocab.comparator().compare("a", "b") < 0) {
            throw new IllegalArgumentException("Vocab must use reverse sort order!");
        }
        this.preTokenizePreProcessor = preTokenizePreProcessor;
        this.tokenPreProcess = tokenPreProcess;
        this.tokens = this.tokenize(vocab, tokens);
    }

    /**
     * @return {@code true} while {@link #nextToken()} has not yet consumed every piece
     */
    @Override
    public boolean hasMoreTokens() {
        return this.cursor.get() < this.tokens.size();
    }

    /**
     * @return the total number of word pieces, independent of how many were already consumed
     */
    @Override
    public int countTokens() {
        return this.tokens.size();
    }

    /**
     * Returns the next word piece and advances the cursor. The token
     * pre-processor, if one is set, is applied to the returned piece.
     *
     * @return the next (optionally pre-processed) word piece
     * @throws IndexOutOfBoundsException if all pieces have already been consumed
     */
    @Override
    public String nextToken() {
        String base = this.tokens.get(this.cursor.getAndIncrement());
        // Read the field once so the null check and the call see the same processor.
        TokenPreProcess processor = this.tokenPreProcess;
        return processor == null ? base : processor.preProcess(base);
    }

    /**
     * Returns all word pieces, each passed through the token pre-processor if
     * one is set. The cursor used by {@link #nextToken()} is not affected.
     *
     * <p>The returned list is always a fresh copy, so callers may modify it
     * without affecting this tokenizer.
     *
     * @return a new list holding every (optionally pre-processed) word piece
     */
    @Override
    public List<String> getTokens() {
        List<String> result = new ArrayList<String>(this.tokens.size());
        TokenPreProcess processor = this.tokenPreProcess;
        if (processor == null) {
            // Copy instead of leaking the internal list: handing out the backing
            // list would let callers change what nextToken()/countTokens() see.
            result.addAll(this.tokens);
            return result;
        }
        for (String token : this.tokens) {
            result.add(processor.preProcess(token));
        }
        return result;
    }

    /**
     * Replaces the pre-processor applied to pieces handed out from now on.
     *
     * @param tokenPreProcessor the new pre-processor, or {@code null} to disable pre-processing
     */
    @Override
    public void setTokenPreProcessor(TokenPreProcess tokenPreProcessor) {
        this.tokenPreProcess = tokenPreProcessor;
    }

    /**
     * Splits {@code toTokenize} into basic tokens and each basic token into
     * word pieces found in {@code vocab}.
     *
     * @param vocab      the word-piece vocabulary
     * @param toTokenize the raw text
     * @return the word pieces in order of appearance
     * @throws IllegalStateException if a basic token cannot be consumed by vocabulary entries
     */
    private List<String> tokenize(NavigableMap<String, Integer> vocab, String toTokenize) {
        List<String> output = new ArrayList<String>();
        String fullString = toTokenize;
        if (this.preTokenizePreProcessor != null) {
            fullString = this.preTokenizePreProcessor.preProcess(toTokenize);
        }
        for (String basicToken : splitPattern.split(fullString)) {
            String candidate = basicToken;
            int count = 0;
            // A remainder of just "##" means the basic token has been fully consumed.
            while (candidate.length() > 0 && !"##".equals(candidate)) {
                String longestSubstring = this.findLongestSubstring(vocab, candidate);
                output.add(longestSubstring);
                // Whatever is left is a continuation piece and is looked up with the "##" marker.
                candidate = "##" + candidate.substring(longestSubstring.length());
                // Guard against endless looping when the matches stop consuming
                // characters of the basic token.
                if (count++ > basicToken.length()) {
                    throw new IllegalStateException("Invalid token encountered: \"" + basicToken + "\" likely contains characters that are not present in the vocabulary. Invalid tokens may be cleaned in a preprocessing step using a TokenPreProcessor. preTokenizePreProcessor=" + this.preTokenizePreProcessor + ", tokenPreProcess=" + this.tokenPreProcess);
                }
            }
        }
        return output;
    }

    /**
     * Finds a vocabulary key that {@code candidate} starts with, probing with
     * successively shorter prefixes of {@code candidate}. With a vocabulary in
     * reverse sort order (as the constructor demands) the first key of each
     * tail map is the greatest key not exceeding the probe, which makes the
     * result the longest matching key.
     *
     * @param vocab     the word-piece vocabulary
     * @param candidate the text whose prefix should be matched
     * @return a vocabulary key that is a prefix of {@code candidate}
     * @throws IllegalStateException if the search runs out of vocabulary entries
     */
    protected String findLongestSubstring(NavigableMap<String, Integer> vocab, String candidate) {
        NavigableMap<String, Integer> tailMap = vocab.tailMap(candidate, true);
        this.checkIfEmpty(tailMap, candidate);
        String longestSubstring = tailMap.firstKey();
        int subStringLength = Math.min(candidate.length(), longestSubstring.length());
        while (!candidate.startsWith(longestSubstring)) {
            // Shorten the probe by one character and continue within the already narrowed map.
            subStringLength--;
            tailMap = tailMap.tailMap(candidate.substring(0, subStringLength), true);
            this.checkIfEmpty(tailMap, candidate);
            longestSubstring = tailMap.firstKey();
        }
        return longestSubstring;
    }

    /**
     * Fails when the remaining search space {@code m} is empty, i.e. no
     * vocabulary entry can match {@code candidate}.
     *
     * @param m         the remaining part of the vocabulary still being searched
     * @param candidate the text being matched, used for the error message
     * @throws IllegalStateException if {@code m} is empty
     */
    protected void checkIfEmpty(Map<String, Integer> m, String candidate) {
        if (m.isEmpty()) {
            throw new IllegalStateException("Invalid token/character encountered: \"" + candidate + "\" likely contains characters that are not present in the vocabulary. Invalid tokens may be cleaned in a preprocessing step using a TokenPreProcessor. preTokenizePreProcessor=" + this.preTokenizePreProcessor + ", tokenPreProcess=" + this.tokenPreProcess);
        }
    }
}

