/*
 * Decompiled with CFR 0.152.
 */
package org.deeplearning4j.text.tokenization.tokenizer;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.NavigableMap;
import java.util.concurrent.atomic.AtomicInteger;
import org.deeplearning4j.text.tokenization.tokenizer.BertWordPieceTokenizer;
import org.deeplearning4j.text.tokenization.tokenizer.TokenPreProcess;
import org.deeplearning4j.text.tokenization.tokenizer.Tokenizer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Word-piece tokenizer that consumes its input incrementally from a stream.
 *
 * <p>Text is read in chunks of up to {@code longestToken} characters, split into
 * "basic" tokens with {@link BertWordPieceTokenizer#splitPattern}, and each basic
 * token is then emitted piece by piece using the supplied vocabulary. The part of a
 * basic token that has not been emitted yet is carried over to the next call with a
 * {@code "##"} prefix.
 *
 * <p>Tokens can be consumed one at a time via {@link #hasMoreTokens()} /
 * {@link #nextToken()}, or drained in one go via {@link #getTokens()}. Once drained,
 * {@link #nextToken()} replays the prefetched list from its beginning.
 *
 * <p>The input stream is decoded as UTF-8. Instances hold mutable, unsynchronized
 * state; the stream passed to the constructor is never closed by this class.
 */
public class BertWordPieceStreamTokenizer implements Tokenizer {
    private static final Logger log = LoggerFactory.getLogger(BertWordPieceStreamTokenizer.class);

    /** Vocabulary used for piece lookup; must be sorted in reverse order (checked in the constructor). */
    private final NavigableMap<String, Integer> vocab;
    /** Buffered character source wrapping the caller's input stream. */
    private final Reader reader;
    /** When {@code true}, every chunk read from the stream is lower-cased before tokenizing. */
    private final boolean lowerCaseOnly;
    /** {@code true} until the reader reports end of stream or fails. */
    private boolean more = true;
    /** Characters read from the stream that have not yet been split into basic tokens. */
    private String buffer = "";
    /** Length of the longest vocabulary key; used as the read chunk size. */
    private int longestToken = 0;
    /** Unemitted remainder of the current basic token ("##"-prefixed), or {@code null} if none. */
    private String prevRest = null;
    /** Set when the last split found no delimiter, i.e. the current basic token may continue in the stream. */
    private boolean noSplit = false;
    /** Optional post-processor applied to every token produced from the stream. */
    private TokenPreProcess tokenPreProcess;
    /** Tokens collected by {@link #getTokens()}; empty until a prefetch happened. */
    private List<String> tokens = new ArrayList<String>();
    /** Read cursor into {@link #tokens} used by {@link #nextToken()}. */
    private AtomicInteger position = new AtomicInteger(0);

    /**
     * Creates a tokenizer over the given stream.
     *
     * @param is            source of the text to tokenize; decoded as UTF-8
     * @param vocab         vocabulary with an explicit comparator that sorts in reverse order
     * @param lowerCaseOnly whether the input is lower-cased before tokenizing
     * @throws IllegalArgumentException if {@code vocab} has no comparator or the comparator
     *                                  orders {@code "a"} before {@code "b"}
     */
    public BertWordPieceStreamTokenizer(InputStream is, NavigableMap<String, Integer> vocab, boolean lowerCaseOnly) {
        this.lowerCaseOnly = lowerCaseOnly;
        if (vocab.comparator() == null || vocab.comparator().compare("a", "b") < 0) {
            throw new IllegalArgumentException("Vocab must use reverse sort order!");
        }
        // Decode explicitly as UTF-8 instead of relying on the platform default charset,
        // so that non-ASCII text is tokenized identically on every machine.
        this.reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
        this.vocab = vocab;
        for (String token : vocab.keySet()) {
            this.longestToken = Math.max(this.longestToken, token.length());
        }
    }

    /**
     * Tells whether {@link #nextToken()} has something left to return: either an
     * unread entry of the prefetched token list, or more input from the stream.
     *
     * @return {@code true} if another token is available
     */
    @Override
    public boolean hasMoreTokens() {
        // Prefetched tokens are served by nextToken(), so they must count as available here too.
        return this.position.get() < this.tokens.size() || this.streamHasMoreTokens();
    }

    /**
     * Tells whether the stream-side state can still yield a token.
     *
     * @return {@code true} if the stream is not exhausted, unsplit characters are
     *         buffered, or a remainder of the previous basic token is pending
     */
    private boolean streamHasMoreTokens() {
        return this.more || this.buffer.length() > 0 || this.prevRest != null;
    }

    /**
     * Reads up to {@code longestToken} further characters from the stream and merges
     * them into the tokenizer state. On end of stream or on an I/O error (which is
     * logged, not rethrown) {@code more} is cleared and whatever was read so far is used.
     */
    private void readMore() {
        StringBuilder builder = new StringBuilder(this.longestToken);
        while (this.more && builder.length() < this.longestToken) {
            try {
                // Reader.read() returns a single UTF-16 code unit, or -1 at end of stream.
                int ch = this.reader.read();
                if (ch >= 0) {
                    builder.append((char) ch);
                } else {
                    this.more = false;
                }
            }
            catch (IOException e) {
                this.more = false;
                log.error("Unexpected exception while reading input stream", e);
            }
        }
        String input = builder.toString();
        if (this.lowerCaseOnly) {
            // Locale.ROOT keeps lower-casing independent of the JVM default locale
            // (e.g. Turkish would otherwise map "I" to a dotless "ı").
            input = input.toLowerCase(Locale.ROOT);
        }
        if (this.noSplit) {
            // The current basic token had no delimiter yet: everything up to the first
            // delimiter in the new chunk still belongs to it.
            String[] parts = BertWordPieceTokenizer.splitPattern.split(input, 2);
            // NOTE(review): when prevRest is null the continuation is stored without a
            // "##" prefix; this looks unintended for a word continuation - confirm.
            this.prevRest = (this.prevRest == null ? "" : this.prevRest) + parts[0];
            if (parts.length > 1) {
                this.noSplit = false;
                this.buffer = this.buffer + parts[1];
            }
        } else {
            this.buffer = this.buffer + input;
        }
    }

    /**
     * Counts all tokens. This drains the stream via {@link #getTokens()}.
     *
     * @return size of the token list returned by {@link #getTokens()}
     */
    @Override
    public int countTokens() {
        return this.getTokens().size();
    }

    /**
     * Returns the next token: the next unread prefetched token if there is one,
     * otherwise the next token produced from the stream.
     *
     * @return the next token
     */
    @Override
    public String nextToken() {
        if (this.position.get() < this.tokens.size()) {
            return this.tokens.get(this.position.getAndIncrement());
        }
        return this.nextTokenFromStream();
    }

    /**
     * Produces one token from the stream-side state, reading more input if needed.
     *
     * @return the next piece, after the optional {@link TokenPreProcess} was applied
     */
    private String nextTokenFromStream() {
        // The current basic token may continue in the stream: pull in its continuation first.
        if (this.noSplit && this.more) {
            this.readMore();
        }
        String basicToken = this.prevRest;
        if (basicToken == null || basicToken.isEmpty()) {
            // Nothing pending from the previous basic token: cut a new one off the buffer.
            if (this.buffer.length() < this.longestToken && this.more) {
                this.readMore();
            }
            String[] parts = BertWordPieceTokenizer.splitPattern.split(this.buffer, 2);
            basicToken = parts[0];
            if (parts.length > 1) {
                this.buffer = parts[1];
                this.noSplit = false;
            } else {
                // No delimiter found: the buffer is consumed entirely and the token may go on.
                this.buffer = "";
                this.noSplit = true;
            }
        }
        // NOTE(review): basicToken can be empty here (e.g. buffer starting with whitespace);
        // how findLongestSubstring handles "" is not visible from this file - verify.
        // The result is assumed to be a prefix of basicToken, as the substring() below relies on it.
        String output = BertWordPieceTokenizer.findLongestSubstring(this.vocab, basicToken);
        String tokenRest = basicToken.substring(output.length());
        if (basicToken.length() > output.length()) {
            tokenRest = "##" + tokenRest;
        }
        if ("##".equals(tokenRest) || tokenRest.length() == 0) {
            tokenRest = null;
        }
        this.prevRest = tokenRest;
        if (this.tokenPreProcess != null) {
            output = this.tokenPreProcess.preProcess(output);
        }
        return output;
    }

    /**
     * Drains the stream into the internal token list (once) and returns that list.
     * Tokens already handed out by {@link #nextToken()} before the prefetch are not included.
     *
     * @return the internal, mutable list of prefetched tokens
     */
    @Override
    public List<String> getTokens() {
        if (!this.tokens.isEmpty()) {
            return this.tokens;
        }
        log.info("Starting prebuffering...");
        // Only stream-side state drives the prefetch; the list being filled must not.
        while (this.streamHasMoreTokens()) {
            this.tokens.add(this.nextTokenFromStream());
        }
        log.info("Tokens prefetch finished. Tokens size: [{}]", this.tokens.size());
        return this.tokens;
    }

    /**
     * Sets the processor applied to each token produced from the stream.
     *
     * @param tokenPreProcessor processor to apply, or {@code null} for none
     */
    @Override
    public void setTokenPreProcessor(TokenPreProcess tokenPreProcessor) {
        this.tokenPreProcess = tokenPreProcessor;
    }
}

