/*
 * Decompiled with CFR 0.152.
 */
package org.deeplearning4j.text.tokenization.tokenizer.preprocessor;

import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
import it.unimi.dsi.fastutil.ints.IntSet;
import java.text.Normalizer;
import java.util.List;
import java.util.Map;
import org.deeplearning4j.text.tokenization.tokenizer.TokenPreProcess;

/**
 * Token pre-processor that cleans text one code point at a time.
 *
 * <p>{@link #preProcess(String)} performs the following steps, in this order, for every
 * code point of the input:
 * <ol>
 *   <li>drops NUL, the Unicode replacement character, control/format characters and,
 *       when accent stripping is enabled, non-spacing marks (the input is NFD-normalized
 *       first so that accents become separate marks);</li>
 *   <li>optionally lower-cases the code point;</li>
 *   <li>replaces every whitespace code point (including tab, line feed and carriage
 *       return) with a single {@code ' '};</li>
 *   <li>drops code points that do not occur in any vocabulary entry, when a vocabulary
 *       was supplied;</li>
 *   <li>surrounds CJK ideographs with spaces;</li>
 *   <li>keeps everything else unchanged.</li>
 * </ol>
 */
public class BertWordPiecePreProcessor implements TokenPreProcess {
    /** Unicode replacement character (U+FFFD); always removed by {@link #preProcess(String)}. */
    public static final char REPLACEMENT_CHAR = '\ufffd';

    /** Whether code points are lower-cased during pre-processing. */
    protected final boolean lowerCase;
    /** Whether the input is NFD-normalized and its non-spacing marks removed. */
    protected final boolean stripAccents;
    /**
     * Every code point that occurs in some vocabulary key, or {@code null} when no
     * vocabulary was supplied (in which case no vocabulary filtering is applied).
     */
    protected final IntSet charSet;

    /**
     * Creates a pre-processor that neither lower-cases nor strips accents and applies
     * no vocabulary filtering.
     */
    public BertWordPiecePreProcessor() {
        this(false, false, null);
    }

    /**
     * Creates a pre-processor.
     *
     * @param lowerCase    if {@code true}, code points are converted to lower case
     * @param stripAccents if {@code true}, the input is NFD-normalized and non-spacing
     *                     marks are removed
     * @param vocab        vocabulary whose keys define the set of permitted code points;
     *                     may be {@code null} to disable vocabulary filtering. Only the
     *                     keys are read.
     */
    public BertWordPiecePreProcessor(boolean lowerCase, boolean stripAccents, Map<String, Integer> vocab) {
        this.lowerCase = lowerCase;
        this.stripAccents = stripAccents;

        IntSet permitted = null;
        if (vocab != null) {
            permitted = new IntOpenHashSet();
            for (String entry : vocab.keySet()) {
                // Walk by code point (not by char) so supplementary characters are
                // recorded as a single value.
                int offset = 0;
                final int length = entry.length();
                while (offset < length) {
                    final int cp = entry.codePointAt(offset);
                    offset += Character.charCount(cp);
                    permitted.add(cp);
                }
            }
        }
        this.charSet = permitted;
    }

    /**
     * Cleans the given text as described in the class documentation.
     *
     * @param token the text to clean; must not be {@code null}
     * @return the cleaned text (possibly empty)
     * @throws NullPointerException if {@code token} is {@code null}
     */
    @Override
    public String preProcess(String token) {
        if (this.stripAccents) {
            // Decompose so that accents become stand-alone non-spacing marks that the
            // loop below can drop.
            token = Normalizer.normalize(token, Normalizer.Form.NFD);
        }

        final int length = token.length();
        final StringBuilder sb = new StringBuilder(length);
        int offset = 0;
        while (offset < length) {
            final int raw = token.codePointAt(offset);
            // Advance using the width of the code point as read, before any case mapping.
            offset += Character.charCount(raw);

            if (isAlwaysRemoved(raw)
                    || (this.stripAccents && Character.getType(raw) == Character.NON_SPACING_MARK)) {
                continue;
            }

            final int cp = this.lowerCase ? Character.toLowerCase(raw) : raw;

            if (isWhiteSpace(cp)) {
                // Whitespace is normalized before the vocabulary check, so it is kept
                // even when the vocabulary contains no whitespace characters.
                sb.append(' ');
                continue;
            }
            if (this.charSet != null && !this.charSet.contains(cp)) {
                // Code point never appears in the vocabulary: drop it.
                continue;
            }
            if (isChineseCharacter(cp)) {
                sb.append(' ');
                sb.appendCodePoint(cp);
                sb.append(' ');
                continue;
            }
            sb.appendCodePoint(cp);
        }
        return sb.toString();
    }

    /** Returns whether the code point is unconditionally dropped by {@link #preProcess(String)}. */
    private static boolean isAlwaysRemoved(int cp) {
        return cp == 0 || cp == REPLACEMENT_CHAR || isControlCharacter(cp);
    }

    /**
     * Tests whether a code point is a control character for the purposes of this class.
     *
     * <p>Tab, line feed and carriage return are deliberately reported as <em>not</em>
     * control characters; {@link #isWhiteSpace(int)} reports them as whitespace instead.
     *
     * @param cp the code point to test
     * @return {@code true} if the general category is {@link Character#CONTROL} or
     *         {@link Character#FORMAT} and the code point is not tab, LF or CR
     */
    public static boolean isControlCharacter(int cp) {
        if (cp == '\t' || cp == '\n' || cp == '\r') {
            return false;
        }
        final int type = Character.getType(cp);
        return type == Character.CONTROL || type == Character.FORMAT;
    }

    /**
     * Tests whether a code point is whitespace for the purposes of this class.
     *
     * @param cp the code point to test
     * @return {@code true} for tab, line feed, carriage return, or any code point whose
     *         general category is {@link Character#SPACE_SEPARATOR}
     */
    public static boolean isWhiteSpace(int cp) {
        if (cp == '\t' || cp == '\n' || cp == '\r') {
            return true;
        }
        return Character.getType(cp) == Character.SPACE_SEPARATOR;
    }

    /**
     * Tests whether a code point lies in one of the CJK ideograph ranges listed below.
     *
     * @param cp the code point to test
     * @return {@code true} if the code point falls in any of the ranges
     */
    public static boolean isChineseCharacter(int cp) {
        return (cp >= 0x4E00 && cp <= 0x9FFF)        // CJK Unified Ideographs
                || (cp >= 0x3400 && cp <= 0x4DBF)    // CJK Unified Ideographs Extension A
                || (cp >= 0x20000 && cp <= 0x2A6DF)  // CJK Unified Ideographs Extension B
                || (cp >= 0x2A700 && cp <= 0x2B73F)  // CJK Unified Ideographs Extension C
                || (cp >= 0x2B740 && cp <= 0x2B81F)  // CJK Unified Ideographs Extension D
                || (cp >= 0x2B820 && cp <= 0x2CEAF)  // CJK Unified Ideographs Extension E
                || (cp >= 0xF900 && cp <= 0xFAFF)    // CJK Compatibility Ideographs
                || (cp >= 0x2F800 && cp <= 0x2FA1F); // CJK Compatibility Ideographs Supplement
    }

    /**
     * Joins word-piece tokens back into a single string.
     *
     * <p>A token starting with {@code "##"} is treated as a continuation: the prefix is
     * removed and the remainder is appended directly to the preceding text. Any other
     * token is preceded by a single space, except when it is the first token of the list
     * or is exactly {@code "."}.
     *
     * @param tokens the tokens to join; must not be {@code null} or contain {@code null}
     * @return the reconstructed text
     */
    public static String reconstructFromTokens(List<String> tokens) {
        final StringBuilder sb = new StringBuilder();
        boolean first = true;
        for (String piece : tokens) {
            if (piece.startsWith("##")) {
                sb.append(piece, 2, piece.length());
            } else {
                if (!first && !".".equals(piece)) {
                    sb.append(' ');
                }
                sb.append(piece);
            }
            // Cleared after every token, including a leading continuation piece, so the
            // next whole-word token is still separated from it by a space.
            first = false;
        }
        return sb.toString();
    }
}

