/*
 * Decompiled with CFR 0.152.
 */
package org.languagetool.tokenizers.pt;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.languagetool.tagging.pt.PortugueseTagger;
import org.languagetool.tokenizers.WordTokenizer;

/**
 * Word tokenizer for Portuguese text.
 *
 * <p>Tokenization happens in three phases:
 * <ol>
 *   <li>Characters that must not split a token (decimal commas, dots and colons between
 *       digits, spaces used as thousands separators, hyphens inside words) are temporarily
 *       replaced by placeholder characters.</li>
 *   <li>The prepared text is cut into tokens with a single regular expression.</li>
 *   <li>Placeholders are turned back into the original characters, and each token is
 *       post-processed by {@link #wordsToAdd(String)}.</li>
 * </ol>
 */
public class PortugueseWordTokenizer extends WordTokenizer {
    /** Flags shared by every pattern in this class. */
    private static final int PATTERN_FLAGS = Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;

    // Placeholder characters (private-use code points) that stand in for separators which
    // must stay inside a token while the word pattern runs.
    private static final char DECIMAL_COMMA_SUBST = '\ue001';
    private static final char NON_BREAKING_SPACE_SUBST = '\ue002';
    private static final char NON_BREAKING_DOT_SUBST = '\ue003';
    private static final char NON_BREAKING_COLON_SUBST = '\ue004';
    /** Placeholder text for a hyphen that joins two parts of one word. */
    private static final String HYPHEN_SUBST_TEXT = "\u0001\u0001PT_HYPHEN\u0001\u0001";

    /** Range of the variation selectors that get glued onto the preceding token. */
    private static final int VARIATION_SELECTOR_FIRST = 0xFE00;
    private static final int VARIATION_SELECTOR_LAST = 0xFE0F;

    // Comma between two digits.
    private static final Pattern DECIMAL_COMMA_PATTERN = Pattern.compile("([\\d]),([\\d])", PATTERN_FLAGS);
    private static final String DECIMAL_COMMA_REPL = "$1" + DECIMAL_COMMA_SUBST + "$2";

    // Digit groups separated by single spaces, optionally followed by a (protected) fraction.
    private static final Pattern DECIMAL_SPACE_PATTERN = Pattern.compile(
            "(?<=^|[\\s(])\\d{1,3}( \\d{3})+(?:[" + DECIMAL_COMMA_SUBST + NON_BREAKING_DOT_SUBST + "]\\d+)?(?=\\D|$)",
            PATTERN_FLAGS);

    // Dot between two digits.
    private static final Pattern DOTTED_NUMBERS_PATTERN = Pattern.compile("([\\d])\\.([\\d])", PATTERN_FLAGS);
    private static final String DOTTED_NUMBERS_REPL = "$1" + NON_BREAKING_DOT_SUBST + "$2";

    // Colon between two digits.
    private static final Pattern COLON_NUMBERS_PATTERN = Pattern.compile("([\\d]):([\\d])", PATTERN_FLAGS);
    private static final String COLON_NUMBERS_REPL = "$1" + NON_BREAKING_COLON_SUBST + "$2";

    // dd.MM.yyyy, yyyy.MM.dd or yyyy-MM-dd.
    private static final Pattern DATE_PATTERN = Pattern.compile(
            "([\\d]{2})\\.([\\d]{2})\\.([\\d]{4})|([\\d]{4})\\.([\\d]{2})\\.([\\d]{2})|([\\d]{4})-([\\d]{2})-([\\d]{2})",
            PATTERN_FLAGS);

    // Digit, dot, ordinal marker (optionally plural).
    private static final Pattern DOTTED_ORDINALS_PATTERN = Pattern.compile(
            "([\\d])\\.([ao\u00aa\u00ba\u1d43\u1d52][s\u02e2]?)", PATTERN_FLAGS);
    private static final String DOTTED_ORDINALS_REPL = "$1" + NON_BREAKING_DOT_SUBST + "$2";

    // Hyphen between a letter and a letter or digit.
    private static final Pattern HYPHEN_PATTERN = Pattern.compile("([\\p{L}])-([\\p{L}\\d])", PATTERN_FLAGS);
    private static final String HYPHEN_REPL = "$1" + HYPHEN_SUBST_TEXT + "$2";

    // Two hyphens separated by a single letter; a plain replaceAll with HYPHEN_PATTERN would
    // skip the second hyphen because its left-hand letter was consumed by the first match.
    private static final Pattern NEARBY_HYPHENS_PATTERN = Pattern.compile("([\\p{L}])-([\\p{L}])-([\\p{L}])", PATTERN_FLAGS);
    private static final String NEARBY_HYPHENS_REPL = "$1" + HYPHEN_SUBST_TEXT + "$2" + HYPHEN_SUBST_TEXT + "$3";

    // Contents of the regex character classes that make up a word. The hyphen placeholder is
    // appended verbatim, so each of its characters becomes a member of the class.
    private static final String WORD_CHARS = "\u00b0\\^\\-\\p{L}\\d\\u0300-\\u036F\\u00A8\\u2070-\\u209F"
            + DECIMAL_COMMA_SUBST + NON_BREAKING_DOT_SUBST + NON_BREAKING_COLON_SUBST + NON_BREAKING_SPACE_SUBST
            + HYPHEN_SUBST_TEXT;
    private static final String WORD_CHARS_LEFT_EDGE = "\u2212@\u20ac\u00a3\\$\u00a2\u00a5\u00a4";
    private static final String WORD_CHARS_RIGHT_EDGE = "\u20ac\u00a3\\$%\u2030\u2031\u00ba\u00aa\u1d43\u1d52\u02e2";

    /** Either a word (with optional edge symbols) or any single non-word character. */
    private static final Pattern WORD_PATTERN = Pattern.compile(
            "[" + WORD_CHARS_LEFT_EDGE + "]?[" + WORD_CHARS + "]+[" + WORD_CHARS_RIGHT_EDGE + "]?|[^" + WORD_CHARS + "]",
            PATTERN_FLAGS);

    /** Hyphenated terms that are kept whole even when the tagger does not know them. */
    private static final List<String> UNSPLIT_HYPHENATED_TERMS = Arrays.asList(
            "mers-cov", "mcgraw-hill", "sars-cov-2", "sars-cov", "ph-metre", "ph-metres",
            "anti-ivg", "anti-uv", "anti-vih", "al-qa\u00efda");

    private final PortugueseTagger tagger;

    public PortugueseWordTokenizer() {
        this.tagger = new PortugueseTagger();
    }

    /**
     * Splits {@code text} into tokens.
     *
     * @param text the text to tokenize; must not be {@code null}
     * @return the tokens, after the inherited e-mail/URL joining step has been applied
     */
    public List<String> tokenize(String text) {
        String prepared = text;

        if (prepared.contains(",")) {
            prepared = DECIMAL_COMMA_PATTERN.matcher(prepared).replaceAll(DECIMAL_COMMA_REPL);
        }

        // Dot handling is only needed when the first dot is not the very last character.
        int firstDot = prepared.indexOf('.');
        boolean dotInsideSentence = firstDot >= 0 && firstDot < prepared.length() - 1;
        if (dotInsideSentence) {
            prepared = protectDottedDates(prepared);
            prepared = DOTTED_NUMBERS_PATTERN.matcher(prepared).replaceAll(DOTTED_NUMBERS_REPL);
            prepared = DOTTED_ORDINALS_PATTERN.matcher(prepared).replaceAll(DOTTED_ORDINALS_REPL);
        }

        prepared = protectSpacedNumbers(prepared);

        if (prepared.contains(":")) {
            prepared = COLON_NUMBERS_PATTERN.matcher(prepared).replaceAll(COLON_NUMBERS_REPL);
        }
        if (prepared.contains("-")) {
            prepared = NEARBY_HYPHENS_PATTERN.matcher(prepared).replaceAll(NEARBY_HYPHENS_REPL);
            prepared = HYPHEN_PATTERN.matcher(prepared).replaceAll(HYPHEN_REPL);
        }

        List<String> tokens = new ArrayList<>();
        Matcher wordMatcher = WORD_PATTERN.matcher(prepared);
        while (wordMatcher.find()) {
            String token = wordMatcher.group();
            if (!tokens.isEmpty() && isVariationSelector(token)) {
                // A lone variation selector belongs to the token that precedes it.
                int last = tokens.size() - 1;
                tokens.set(last, tokens.get(last) + token);
                continue;
            }
            tokens.addAll(wordsToAdd(restoreSeparators(token)));
        }
        return joinEMailsAndUrls(tokens);
    }

    /**
     * Replaces the dots inside every date match with the non-breaking dot placeholder.
     *
     * <p>The matched text is rewritten in place instead of being rebuilt from capture groups:
     * the date pattern has three alternatives with separate groups, and a group-based
     * replacement would substitute empty strings for whichever alternative did not match,
     * deleting the digits of the date. Hyphen-separated dates are left untouched.
     */
    private static String protectDottedDates(String text) {
        Matcher dateMatcher = DATE_PATTERN.matcher(text);
        if (!dateMatcher.find()) {
            return text;
        }
        StringBuffer out = new StringBuffer();
        do {
            String date = dateMatcher.group().replace('.', NON_BREAKING_DOT_SUBST);
            dateMatcher.appendReplacement(out, Matcher.quoteReplacement(date));
        } while (dateMatcher.find());
        dateMatcher.appendTail(out);
        return out.toString();
    }

    /** Replaces the spaces inside space-grouped numbers with the non-breaking space placeholder. */
    private static String protectSpacedNumbers(String text) {
        Matcher numberMatcher = DECIMAL_SPACE_PATTERN.matcher(text);
        if (!numberMatcher.find()) {
            return text;
        }
        StringBuffer out = new StringBuffer();
        do {
            String number = numberMatcher.group()
                    .replace(' ', NON_BREAKING_SPACE_SUBST)
                    .replace('\u00a0', NON_BREAKING_SPACE_SUBST);
            numberMatcher.appendReplacement(out, Matcher.quoteReplacement(number));
        } while (numberMatcher.find());
        numberMatcher.appendTail(out);
        return out.toString();
    }

    /** Returns whether {@code token} is exactly one character in the variation selector range. */
    private static boolean isVariationSelector(String token) {
        if (token.length() != 1) {
            return false;
        }
        int codePoint = token.codePointAt(0);
        return codePoint >= VARIATION_SELECTOR_FIRST && codePoint <= VARIATION_SELECTOR_LAST;
    }

    /** Turns every placeholder in {@code token} back into the character it stands for. */
    private static String restoreSeparators(String token) {
        return token
                .replace(DECIMAL_COMMA_SUBST, ',')
                .replace(NON_BREAKING_COLON_SUBST, ':')
                .replace(NON_BREAKING_SPACE_SUBST, ' ')
                .replace(NON_BREAKING_DOT_SUBST, '.')
                .replace(HYPHEN_SUBST_TEXT, "-");
    }

    /**
     * Decides how a raw token ends up in the result.
     *
     * <p>Currency expressions are split by the inherited helper. A token without a hyphen, a
     * hyphenated token the tagger reports as tagged, or one of the listed exceptions is kept
     * whole. Any other hyphenated token is split at each hyphen, with the hyphens kept as
     * separate tokens.
     *
     * @param s the token with all placeholders already restored
     * @return the resulting tokens; empty when {@code s} is empty
     */
    private List<String> wordsToAdd(String s) {
        List<String> result = new ArrayList<>();
        if (s.isEmpty()) {
            return result;
        }
        if (isCurrencyExpression(s)) {
            result.addAll(splitCurrencyExpression(s));
            return result;
        }
        if (!s.contains("-") || isTaggedWord(s) || isUnsplitHyphenatedTerm(s)) {
            result.add(s);
            return result;
        }
        StringTokenizer parts = new StringTokenizer(s, "-", true);
        while (parts.hasMoreTokens()) {
            result.add(parts.nextToken());
        }
        return result;
    }

    /** Asks the tagger about {@code s}, with typographic apostrophes replaced by plain ones. */
    private boolean isTaggedWord(String s) {
        return tagger.tag(Arrays.asList(s.replace("\u2019", "'"))).get(0).isTagged();
    }

    /** Case-insensitive lookup in the list of hyphenated terms that are never split. */
    private static boolean isUnsplitHyphenatedTerm(String s) {
        for (String term : UNSPLIT_HYPHENATED_TERMS) {
            if (s.equalsIgnoreCase(term)) {
                return true;
            }
        }
        return false;
    }
}

