/*
 * Decompiled with CFR 0.152.
 */
package org.languagetool.tokenizers.uk;

import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;
import java.util.regex.Pattern;
import org.languagetool.tokenizers.Tokenizer;

/**
 * Splits Ukrainian text into tokens on whitespace, invisible formatting
 * characters and punctuation.
 *
 * <p>Delimiters are themselves returned as single-character tokens, so
 * concatenating the returned tokens reproduces the cleaned-up input. A comma
 * standing directly between two digits (a decimal comma, e.g. {@code 1,5}) is
 * not treated as a delimiter.
 */
public class UkrainianWordTokenizer
implements Tokenizer {
    /** Every character in this string is a token delimiter (and is emitted as its own token). */
    private static final String SPLIT_CHARS = " \u00a0\u115f\u1160\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u200c\u200d\u200e\u200f\u2028\u2029\u202a\u202b\u202c\u202d\u202e\u202f\u205f\u2060\u2061\u2062\u2063\u206a\u206b\u206c\u206d\u206e\u206f\u3000\u3164\ufeff\uffa0\ufff9\ufffa\ufffb,.;()[]{}<>!?:/|\\\"\u00ab\u00bb\u201e\u201d\u201c`\u00b4\u2018\u201b\u2032\u2026\u00bf\u00a1\t\n\r";

    /** Matches a comma that sits directly between two digits; the digits are captured as groups 1 and 2. */
    private static final Pattern DECIMAL_COMMA_PATTERN =
            Pattern.compile("([\\d]),([\\d])", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);

    /**
     * Temporary stand-in for a decimal comma while the text is being split.
     *
     * <p>This is a Unicode private-use character (U+E001). An ordinary character
     * such as an underscore must not be used here: every occurrence of the
     * placeholder is turned back into a comma after splitting, which would
     * corrupt any token that legitimately contains that character
     * (e.g. {@code foo_bar} would become {@code foo,bar}).
     */
    private static final char DECIMAL_COMMA_SUBST = '\uE001';

    /** Replacement template that keeps both digits and swaps the comma for the placeholder. */
    private static final String DECIMAL_COMMA_REPLACEMENT = "$1" + DECIMAL_COMMA_SUBST + "$2";

    /**
     * Tokenizes the given text.
     *
     * @param text the text to split; must not be {@code null}
     * @return the tokens in input order, including one token per delimiter character;
     *         empty if the text is empty
     */
    public List<String> tokenize(String text) {
        String cleaned = cleanup(text);
        // Hide decimal commas from the splitter: ',' is one of the delimiter characters.
        String shielded = DECIMAL_COMMA_PATTERN.matcher(cleaned).replaceAll(DECIMAL_COMMA_REPLACEMENT);

        List<String> tokenList = new ArrayList<String>();
        // returnDelims = true: delimiter characters are emitted as tokens as well.
        StringTokenizer st = new StringTokenizer(shielded, SPLIT_CHARS, true);
        while (st.hasMoreTokens()) {
            // Restore the decimal commas that were hidden above.
            tokenList.add(st.nextToken().replace(DECIMAL_COMMA_SUBST, ','));
        }
        return tokenList;
    }

    /**
     * Normalizes the text before splitting: removes the combining acute accent
     * (U+0301) and the soft hyphen (U+00AD), and replaces the right single
     * quotation mark (U+2019) and the modifier letter apostrophe (U+02BC) with
     * the ASCII apostrophe, which is not a delimiter.
     */
    private static String cleanup(String text) {
        return text
                .replace("\u0301", "")
                .replace("\u00ad", "")
                .replace('\u2019', '\'')
                .replace('\u02bc', '\'');
    }
}

