/*
 * Decompiled with CFR 0.152.
 */
package org.languagetool.tokenizers.uk;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.languagetool.tokenizers.Tokenizer;

/**
 * Splits text into tokens for the Ukrainian language module.
 *
 * <p>The text is split on the characters listed in {@link #SPLIT_CHARS}; the
 * delimiters themselves are returned as tokens. Before splitting, characters
 * that must not cause a split (decimal commas, dots inside numbers, dates and
 * known abbreviations, colons between digits, parentheses inside words,
 * ellipses and URLs) are temporarily swapped for private-use code points and
 * restored afterwards in every token.
 *
 * <p>The class keeps no mutable state.
 */
public class UkrainianWordTokenizer
implements Tokenizer {
    /** Every character in this string is a token delimiter (and is itself returned as a token). */
    private static final String SPLIT_CHARS = " \u00a0\u115f\u1160\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u200c\u200d\u200e\u200f\u2028\u2029\u202a\u202b\u202c\u202d\u202e\u202f\u205f\u2060\u2061\u2062\u2063\u206a\u206b\u206c\u206d\u206e\u206f\u3000\u3164\ufeff\uffa0\ufff9\ufffa\ufffb,.;()[]{}<>!?:/|\\\"\u00ab\u00bb\u201e\u201d\u201c`\u00b4\u2018\u201b\u2032\u2026\u00bf\u00a1\t\n\r\ue100\ue101\ue102\ue110";

    /** Flags shared by the number/date/brace patterns (numeric value 66 in the compiled class). */
    private static final int CASE_INSENSITIVE_UNICODE = Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;

    // A comma between two digits.
    private static final Pattern DECIMAL_COMMA_PATTERN = Pattern.compile("([\\d]),([\\d])", CASE_INSENSITIVE_UNICODE);
    private static final char DECIMAL_COMMA_SUBST = '\ue001';
    // A dot between two digits.
    private static final Pattern DOTTED_NUMBERS_PATTERN = Pattern.compile("([\\d])\\.([\\d])", CASE_INSENSITIVE_UNICODE);
    private static final char NUMBER_DOT_SUBST = '\ue002';
    // A colon between two digits.
    private static final Pattern COLON_NUMBERS_PATTERN = Pattern.compile("([\\d]):([\\d])", CASE_INSENSITIVE_UNICODE);
    private static final char COLON_DOT_SUBST = '\ue003';
    // dd.mm.yyyy, yyyy.mm.dd or yyyy-mm-dd; each alternative has its own capture groups.
    private static final Pattern DATE_PATTERN = Pattern.compile("([\\d]{2})\\.([\\d]{2})\\.([\\d]{4})|([\\d]{4})\\.([\\d]{2})\\.([\\d]{2})|([\\d]{4})-([\\d]{2})-([\\d]{2})", CASE_INSENSITIVE_UNICODE);
    private static final char DATE_DOT_SUBST = '\ue004';
    // A parenthesised run of letters directly attached to a preceding letter.
    private static final Pattern BRACE_IN_WORD_PATTERN = Pattern.compile("([\u0430-\u044f\u0456\u0457\u0454\u0491'])\\(([\u0430-\u044f\u0456\u0457\u0454\u0491']+)\\)", CASE_INSENSITIVE_UNICODE);
    private static final char LEFT_BRACE_SUBST = '\ue005';
    private static final char RIGHT_BRACE_SUBST = '\ue006';

    // Abbreviation patterns: each one captures the text around a dot that belongs
    // to an abbreviation, so that the dot can be hidden from the splitter.
    private static final Pattern ABBR_DOT_PATTERN = Pattern.compile("(\u0442\u0438\u0441)\\.([ \u00a0]+[\u0430-\u044f\u0456\u0457\u0454\u0491])");
    private static final Pattern ABBR_DOT_PATTERN1 = Pattern.compile("([^\u0430-\u044f\u0456\u0457\u0454\u0491\u0410-\u042f\u0406\u0407\u0404\u0490'-]\u043b\u0430\u0442)\\.([ \u00a0]+[a-zA-Z])");
    private static final Pattern ABBR_DOT_PATTERN2 = Pattern.compile("([\u0410\u0430]\u043a\u0430\u0434|[\u041f\u043f]\u0440\u043e\u0444|[\u0414\u0434]\u043e\u0446|[\u0410\u0430]\u0441\u0438\u0441\u0442|\u0432\u0443\u043b|\u043e|\u0440|\u0456\u043c)\\.([\\s\u00a0]+[\u0410-\u042f\u0406\u0407\u0404\u0490])");
    private static final Pattern ABBR_DOT_PATTERN5 = Pattern.compile("((?:[0-9]|\u043a\u0432\\.?|\u043a\u0443\u0431\\.?)[\\s\u00a0]+[\u0441\u043c])\\.");
    private static final Pattern ABBR_DOT_PATTERN3 = Pattern.compile("(\u0441)\\.(-\u0433)\\.");
    private static final Pattern ABBR_DOT_PATTERN4 = Pattern.compile("([^\u0430-\u044f\u0456\u0457\u0454\u0491'-][\u0432\u0435\u043a\u043d\u043f\u0440\u0441\u0442\u0446\u0447]{1,2})\\.([\u0435\u043a\u043c\u043d\u043f\u0440\u0441\u0442\u0447]{1,2})\\.");
    // NOTE(review): the group "(?:\u044c\u043a)" after "\u0433\u0440\u0435\u0446" has no trailing '?',
    // unlike the similar optional group "(?:\u0438\u0437\u044c\u043a)?" earlier in this pattern -
    // confirm whether that is intended.
    private static final Pattern ABBR_DOT_PATTERN6 = Pattern.compile("([^\u0430-\u044f\u0456\u0457\u0454\u0491\u0410-\u042f\u0406\u0407\u0404\u0490'-](?:\u0430\u043c\u0435\u0440|\u0430\u043d\u0433\u043b|\u0431\u043b(?:\u0438\u0437\u044c\u043a)?|\u0432\u0456\u0440\u043c|\u0433\u0440\u0435\u0446(?:\u044c\u043a)|\u0434\u0438\u0432|\u0434\u043e\u043b|\u0434\u043e\u0441\u043b|\u0434\u043e\u0446|\u0435\u043b|\u0436\u0456\u043d|\u0437\u0430\u0441\u0442|\u0437\u0432|\u0456\u043c|\u0456\u0432\u0440|\u0456\u0441\u043f|\u0456\u0442\u0430\u043b|\u043a|\u043a\u0432|[1-9]-\u043a\u0456\u043c\u043d|\u043a\u0456\u043c\u043d|\u043a\u043b|\u043a\u043e\u043f|\u043c|\u043d|\u043d\u0430\u043f\u0440|\u043f|\u043f\u0435\u043d|\u043f\u0435\u0440\u0435\u043a\u043b|\u043f\u043b|\u043f\u043e\u0440|\u043f\u043e\u0447|\u043f\u0440\u0438\u0431\u043b|\u043f\u0440\u043e\u0432|\u043f\u0440\u043e\u0441\u043f|[\u0420\u0440]\u0435\u0434|[\u0420\u0440]\u0435\u0436|\u0440\u0442|\u0441|[\u0421\u0441]\u0432|\u0441\u043e\u0446|\u0441\u043f\u0456\u0432\u0430\u0432\u0442|\u0441\u0442\u043e\u0440|\u0442\u0430\u0431\u043b|\u0442\u0435\u043b|\u0443\u043a\u0440|\u0444\u0456\u043b\u043e\u043b|\u0444\u0440|\u0444\u0440\u0430\u043d\u0446|\u0447|\u0447\u0430\u0439\u043d|\u0446))\\.(?!$)");
    private static final Pattern ABBR_DOT_PATTERN6_2 = Pattern.compile("([^\u0430-\u044f\u0456\u0457\u0454\u0491\u0410-\u042f\u0406\u0407\u0404\u0490'-]((\u0442\u0430|\u0439) \u0456\u043d|\u0435|\u043e\u0431\u043b|\u0440|\u0440\u0440|\u0440\u0443\u0431|\u0441\u0442|\u0441\u0442\u043e\u043b|\u0441\u0442\u043e\u0440|\u0447\u043e\u043b|\u0448\u0442))\\.");
    private static final Pattern ABBR_DOT_PATTERN7 = Pattern.compile("([\u0456\u0439][ \u00a0]+\u0442)\\.([ \u00a0]*(\u0434|\u043f|\u0456\u043d))\\.");
    private static final Pattern ABBR_DOT_PATTERN8 = Pattern.compile("([\\s\u00a0]+(?:[\u0420\u0440]\u0435\u0434|[\u0410\u0430]\u0432\u0442))\\.([\\)\\]])");
    private static final char ABBR_DOT_SUBST = '\ue007';

    /** Delimiter inserted to force a token break; tokens equal to it are dropped from the result. */
    private static final String BREAKING_PLACEHOLDER = "\ue110";

    private static final String ELLIPSIS = "...";
    private static final String ELLIPSIS_SUBST = "\ue100";
    private static final String ELLIPSIS2 = "!..";
    private static final String ELLIPSIS2_SUBST = "\ue101";
    private static final String ELLIPSIS3 = "?..";
    private static final String ELLIPSIS3_SUBST = "\ue102";

    // Anchored with ^ and $ (no MULTILINE), so it only matches when the whole text is a URL.
    private static final Pattern URL_PATTERN = Pattern.compile("^(https?|ftp)://[^\\s/$.?#].[^\\s]*$", Pattern.CASE_INSENSITIVE);
    /** Code point of the first URL placeholder; further URLs use the following code points. */
    private static final int URL_START_REPLACE_CHAR = 0xE300;

    /**
     * Tokenizes the given text.
     *
     * @param text the text to split; must not be {@code null}
     * @return the tokens in order of appearance, delimiters included
     * @throws NullPointerException if {@code text} is {@code null}
     */
    public List<String> tokenize(String text) {
        Map<String, String> urls = new HashMap<String, String>();
        text = cleanup(text);

        // URLs are hidden first so that none of the later substitutions can
        // leave a placeholder character inside the stored URL text.
        // "://" is required literally by URL_PATTERN, so it is a safe cheap pre-check.
        if (text.contains("://")) {
            text = protectUrls(text, urls);
        }
        if (text.contains(",")) {
            text = DECIMAL_COMMA_PATTERN.matcher(text).replaceAll("$1" + DECIMAL_COMMA_SUBST + "$2");
        }
        text = text.replace(ELLIPSIS, ELLIPSIS_SUBST);
        text = text.replace(ELLIPSIS2, ELLIPSIS2_SUBST);
        text = text.replace(ELLIPSIS3, ELLIPSIS3_SUBST);
        if (text.contains(".")) {
            text = protectDots(text);
        }
        if (text.contains(":")) {
            text = COLON_NUMBERS_PATTERN.matcher(text).replaceAll("$1" + COLON_DOT_SUBST + "$2");
        }
        if (text.contains("(")) {
            text = BRACE_IN_WORD_PATTERN.matcher(text).replaceAll("$1" + LEFT_BRACE_SUBST + "$2" + RIGHT_BRACE_SUBST);
        }

        List<String> tokenList = new ArrayList<String>();
        StringTokenizer splitter = new StringTokenizer(text, SPLIT_CHARS, true);
        while (splitter.hasMoreTokens()) {
            String token = splitter.nextToken();
            if (token.equals(BREAKING_PLACEHOLDER)) {
                continue;
            }
            tokenList.add(restoreToken(token, urls));
        }
        return tokenList;
    }

    /** Replaces every URL match with its own placeholder character and records the mapping in {@code urls}. */
    private static String protectUrls(String text, Map<String, String> urls) {
        Matcher matcher = URL_PATTERN.matcher(text);
        if (!matcher.find()) {
            return text;
        }
        StringBuffer result = new StringBuffer(text.length());
        int placeholderCode = URL_START_REPLACE_CHAR;
        do {
            String placeholder = String.valueOf((char) placeholderCode++);
            urls.put(placeholder, matcher.group());
            // The placeholder contains neither '$' nor '\', so it needs no quoting.
            matcher.appendReplacement(result, placeholder);
        } while (matcher.find());
        matcher.appendTail(result);
        return result.toString();
    }

    /** Hides dots that belong to dates, numbers and abbreviations; the order of the steps is significant. */
    private static String protectDots(String text) {
        text = protectDates(text);
        text = DOTTED_NUMBERS_PATTERN.matcher(text).replaceAll("$1" + NUMBER_DOT_SUBST + "$2");
        text = ABBR_DOT_PATTERN4.matcher(text).replaceAll("$1" + ABBR_DOT_SUBST + BREAKING_PLACEHOLDER + "$2" + ABBR_DOT_SUBST);
        text = ABBR_DOT_PATTERN.matcher(text).replaceAll("$1" + ABBR_DOT_SUBST + "$2");
        text = ABBR_DOT_PATTERN1.matcher(text).replaceAll("$1" + ABBR_DOT_SUBST + "$2");
        text = ABBR_DOT_PATTERN2.matcher(text).replaceAll("$1" + ABBR_DOT_SUBST + "$2");
        text = ABBR_DOT_PATTERN5.matcher(text).replaceAll("$1" + BREAKING_PLACEHOLDER + ABBR_DOT_SUBST);
        text = ABBR_DOT_PATTERN3.matcher(text).replaceAll("$1" + ABBR_DOT_SUBST + "$2" + ABBR_DOT_SUBST);
        text = ABBR_DOT_PATTERN6.matcher(text).replaceAll("$1" + ABBR_DOT_SUBST);
        text = ABBR_DOT_PATTERN6_2.matcher(text).replaceAll("$1" + ABBR_DOT_SUBST);
        text = ABBR_DOT_PATTERN7.matcher(text).replaceAll("$1" + ABBR_DOT_SUBST + "$2" + ABBR_DOT_SUBST);
        text = ABBR_DOT_PATTERN8.matcher(text).replaceAll("$1" + ABBR_DOT_SUBST + "$2");
        return text;
    }

    /**
     * Hides the dots of every date matched by {@link #DATE_PATTERN}.
     *
     * <p>The matched text is rewritten as a whole instead of via group
     * references: the three alternatives of the pattern use different group
     * numbers, so referencing only groups 1-3 would erase dates matched by the
     * second or third alternative. Dash-separated dates come out unchanged.
     */
    private static String protectDates(String text) {
        Matcher matcher = DATE_PATTERN.matcher(text);
        if (!matcher.find()) {
            return text;
        }
        StringBuffer result = new StringBuffer(text.length());
        do {
            // A match holds only digits, dots or dashes, so the replacement needs no quoting.
            matcher.appendReplacement(result, matcher.group().replace('.', DATE_DOT_SUBST));
        } while (matcher.find());
        matcher.appendTail(result);
        return result.toString();
    }

    /** Turns every placeholder in a token back into the text it stands for; URLs are restored last. */
    private static String restoreToken(String token, Map<String, String> urls) {
        token = token.replace(DECIMAL_COMMA_SUBST, ',')
                .replace(DATE_DOT_SUBST, '.')
                .replace(NUMBER_DOT_SUBST, '.')
                .replace(ABBR_DOT_SUBST, '.')
                .replace(COLON_DOT_SUBST, ':')
                .replace(LEFT_BRACE_SUBST, '(')
                .replace(RIGHT_BRACE_SUBST, ')')
                .replace(ELLIPSIS_SUBST, ELLIPSIS)
                .replace(ELLIPSIS2_SUBST, ELLIPSIS2)
                .replace(ELLIPSIS3_SUBST, ELLIPSIS3);
        for (Map.Entry<String, String> entry : urls.entrySet()) {
            token = token.replace(entry.getKey(), entry.getValue());
        }
        return token;
    }

    /** Normalizes the apostrophe variants U+2019, U+02BC and U+2018 to the ASCII apostrophe. */
    private static String cleanup(String text) {
        return text.replace('\u2019', '\'').replace('\u02bc', '\'').replace('\u2018', '\'');
    }
}

