/*
 * Decompiled with CFR 0.152.
 */
package com.robrua.nlp.bert;

import com.google.common.collect.ImmutableSet;
import com.robrua.nlp.bert.Tokenizer;
import java.text.Normalizer;
import java.util.Arrays;
import java.util.Locale;
import java.util.Set;
import java.util.stream.Stream;

/**
 * Tokenizer that cleans a text sequence and splits it on whitespace and punctuation.
 *
 * <p>For every input sequence the following steps are applied, in order:
 * <ol>
 *   <li>NUL, U+FFFD and control characters are removed; every whitespace character is
 *       replaced by a plain space ({@link #cleanText});</li>
 *   <li>every CJK ideograph is surrounded by spaces so that it becomes its own token
 *       ({@link #tokenizeChineseCharacters});</li>
 *   <li>the text is split with {@code Tokenizer.whitespaceTokenize};</li>
 *   <li>each token is optionally lower-cased and stripped of accents, then split around
 *       punctuation ({@link #stripAndSplit}).</li>
 * </ol>
 *
 * <p>Instances hold only an immutable flag.
 */
public class BasicTokenizer extends Tokenizer {
    /** Unicode general categories whose characters are dropped by {@link #cleanText}. */
    private static final Set<Integer> CONTROL_CATEGORIES = ImmutableSet.of(
            (int) Character.CONTROL,
            (int) Character.FORMAT,
            (int) Character.PRIVATE_USE,
            (int) Character.SURROGATE,
            (int) Character.UNASSIGNED);

    /** Unicode general categories that are treated as punctuation. */
    private static final Set<Integer> PUNCTUATION_CATEGORIES = ImmutableSet.of(
            (int) Character.CONNECTOR_PUNCTUATION,
            (int) Character.DASH_PUNCTUATION,
            (int) Character.END_PUNCTUATION,
            (int) Character.FINAL_QUOTE_PUNCTUATION,
            (int) Character.INITIAL_QUOTE_PUNCTUATION,
            (int) Character.OTHER_PUNCTUATION,
            (int) Character.START_PUNCTUATION);

    /** Tab, line feed and carriage return: in a control category but never treated as control. */
    private static final Set<Integer> SAFE_CONTROL_CHARACTERS =
            ImmutableSet.of((int) '\t', (int) '\n', (int) '\r');

    /** Code points that are always removed: NUL and the Unicode replacement character. */
    private static final Set<Integer> STRIP_CHARACTERS = ImmutableSet.of(0x0000, 0xFFFD);

    /** Code points that are whitespace regardless of their Unicode category. */
    private static final Set<Integer> WHITESPACE_CHARACTERS =
            ImmutableSet.of((int) ' ', (int) '\t', (int) '\n', (int) '\r');

    /** Whether tokens are lower-cased and stripped of accents before punctuation splitting. */
    private final boolean doLowerCase;

    /**
     * Creates a tokenizer.
     *
     * @param doLowerCase if {@code true}, every token is lower-cased and has its non-spacing
     *                    marks (accents) removed
     */
    public BasicTokenizer(boolean doLowerCase) {
        this.doLowerCase = doLowerCase;
    }

    /**
     * Removes unwanted code points and normalizes whitespace.
     *
     * @param sequence the raw text
     * @return the text without stripped/control code points and with every whitespace
     *         code point replaced by U+0020
     */
    private static String cleanText(String sequence) {
        return sequence.codePoints()
                .filter(codePoint -> !STRIP_CHARACTERS.contains(codePoint) && !isControl(codePoint))
                .map(codePoint -> isWhitespace(codePoint) ? ' ' : codePoint)
                .collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append)
                .toString();
    }

    /**
     * Tells whether a code point lies in one of the CJK ideograph ranges checked below.
     *
     * @param codePoint the code point to test
     * @return {@code true} if the code point is inside any of the listed ranges
     */
    private static boolean isChineseCharacter(int codePoint) {
        return codePoint >= 0x4E00 && codePoint <= 0x9FFF       // CJK Unified Ideographs
                || codePoint >= 0x3400 && codePoint <= 0x4DBF   // Extension A
                || codePoint >= 0x20000 && codePoint <= 0x2A6DF // Extension B
                || codePoint >= 0x2A700 && codePoint <= 0x2B73F // Extension C
                || codePoint >= 0x2B740 && codePoint <= 0x2B81F // Extension D
                || codePoint >= 0x2B820 && codePoint <= 0x2CEAF // Extension E
                || codePoint >= 0xF900 && codePoint <= 0xFAFF   // CJK Compatibility Ideographs
                || codePoint >= 0x2F800 && codePoint <= 0x2FA1F; // Compatibility Ideographs Supplement
    }

    /**
     * Tells whether a code point is a control character that must be removed.
     * Tab, line feed and carriage return are never reported as control characters.
     */
    private static boolean isControl(int codePoint) {
        return !SAFE_CONTROL_CHARACTERS.contains(codePoint)
                && CONTROL_CATEGORIES.contains(Character.getType(codePoint));
    }

    /**
     * Tells whether a code point is punctuation: either a non-alphanumeric printable ASCII
     * symbol (which includes characters such as {@code $} or {@code ^} that Unicode does not
     * classify as punctuation) or a member of a Unicode punctuation category.
     */
    private static boolean isPunctuation(int codePoint) {
        return codePoint >= '!' && codePoint <= '/'
                || codePoint >= ':' && codePoint <= '@'
                || codePoint >= '[' && codePoint <= '`'
                || codePoint >= '{' && codePoint <= '~'
                || PUNCTUATION_CATEGORIES.contains(Character.getType(codePoint));
    }

    /**
     * Tells whether a code point is whitespace: space, tab, line feed, carriage return or
     * any code point of category {@link Character#SPACE_SEPARATOR}.
     */
    private static boolean isWhitespace(int codePoint) {
        return WHITESPACE_CHARACTERS.contains(codePoint)
                || Character.getType(codePoint) == Character.SPACE_SEPARATOR;
    }

    /**
     * Splits a token around punctuation. Every punctuation code point becomes a piece of its
     * own; the runs of non-punctuation between them become the remaining pieces.
     *
     * @param token the token to split
     * @return the pieces in their original order; empty runs are not emitted
     */
    private static Stream<String> splitOnPunctuation(String token) {
        Stream.Builder<String> pieces = Stream.builder();
        StringBuilder run = new StringBuilder();
        token.codePoints().forEachOrdered(codePoint -> {
            if (isPunctuation(codePoint)) {
                // Flush the pending run only when there is one, so that leading or
                // consecutive punctuation does not produce empty pieces.
                if (run.length() > 0) {
                    pieces.accept(run.toString());
                    run.setLength(0);
                }
                pieces.accept(new String(Character.toChars(codePoint)));
            } else {
                run.appendCodePoint(codePoint);
            }
        });
        if (run.length() > 0) {
            pieces.accept(run.toString());
        }
        return pieces.build();
    }

    /**
     * Removes accents by decomposing the token (NFD) and dropping every code point of
     * category {@link Character#NON_SPACING_MARK}.
     */
    private static String stripAccents(String token) {
        return Normalizer.normalize(token, Normalizer.Form.NFD).codePoints()
                .filter(codePoint -> Character.getType(codePoint) != Character.NON_SPACING_MARK)
                .collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append)
                .toString();
    }

    /**
     * Surrounds every code point accepted by {@link #isChineseCharacter} with spaces so that
     * a later whitespace split isolates it; all other code points are copied unchanged.
     */
    private static String tokenizeChineseCharacters(String sequence) {
        StringBuilder spaced = new StringBuilder();
        sequence.codePoints().forEachOrdered(codePoint -> {
            if (isChineseCharacter(codePoint)) {
                spaced.append(' ').appendCodePoint(codePoint).append(' ');
            } else {
                spaced.appendCodePoint(codePoint);
            }
        });
        return spaced.toString();
    }

    /**
     * Optionally lower-cases and strips accents from a token, then splits it around
     * punctuation.
     *
     * @param token a single whitespace-delimited token
     * @return the punctuation-split pieces joined by single spaces
     */
    private String stripAndSplit(String token) {
        String normalized = token;
        if (doLowerCase) {
            // Locale.ROOT keeps the result independent of the JVM default locale
            // (e.g. the Turkish locale would map 'I' to dotless 'ı').
            normalized = stripAccents(token.toLowerCase(Locale.ROOT));
        }
        return String.join(" ", splitOnPunctuation(normalized).toArray(String[]::new));
    }

    /** Runs the complete tokenization pipeline on one sequence. */
    private String[] tokenizeSequence(String sequence) {
        return whitespaceTokenize(tokenizeChineseCharacters(cleanText(sequence)))
                .map(this::stripAndSplit)
                .flatMap(Tokenizer::whitespaceTokenize)
                .toArray(String[]::new);
    }

    /**
     * Tokenizes several sequences independently.
     *
     * @param sequences the sequences to tokenize
     * @return one token array per input sequence, in input order
     */
    @Override
    public String[][] tokenize(String... sequences) {
        return Arrays.stream(sequences)
                .map(this::tokenizeSequence)
                .toArray(String[][]::new);
    }

    /**
     * Tokenizes a single sequence.
     *
     * @param sequence the sequence to tokenize
     * @return the tokens of the sequence, in order
     */
    @Override
    public String[] tokenize(String sequence) {
        return tokenizeSequence(sequence);
    }
}

