package org.languagetool.tokenizers.pt;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.languagetool.tagging.pt.PortugueseTagger;
import org.languagetool.tokenizers.WordTokenizer;

/* loaded from: input_file:org/languagetool/tokenizers/pt/PortugueseWordTokenizer.class */
/**
 * Word tokenizer for Portuguese text.
 *
 * <p>Before the text is split, characters that must not break a token (decimal commas, dots and
 * colons between digits, spaces used as thousands separators, hyphens between letters) are
 * temporarily swapped for placeholder characters. The text is then split with a single regular
 * expression and the placeholders are turned back into the original characters, so that every
 * token is a verbatim piece of the input.
 */
public class PortugueseWordTokenizer extends WordTokenizer {
    // Placeholders from the Unicode private-use area (U+E001..U+E004).
    private static final char DECIMAL_COMMA_SUBST = '\uE001';
    private static final char NON_BREAKING_SPACE_SUBST = '\uE002';
    private static final char NON_BREAKING_DOT_SUBST = '\uE003';
    private static final char NON_BREAKING_COLON_SUBST = '\uE004';
    private static final char NO_BREAK_SPACE = '\u00A0';
    // Placeholder for a hyphen that sits between two letters.
    private static final String HYPHEN_SUBST = "\u0001\u0001PT_HYPHEN\u0001\u0001";

    // First and last code point of the "Variation Selectors" block.
    private static final int VARIATION_SELECTOR_FIRST = 0xFE00;
    private static final int VARIATION_SELECTOR_LAST = 0xFE0F;

    private static final int FLAGS = Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;

    private static final String DECIMAL_COMMA_REPL = "$1" + DECIMAL_COMMA_SUBST + "$2";
    private static final String DOTTED_NUMBERS_REPL = "$1" + NON_BREAKING_DOT_SUBST + "$2";
    private static final String COLON_NUMBERS_REPL = "$1" + NON_BREAKING_COLON_SUBST + "$2";
    private static final String DOTTED_ORDINALS_REPL = "$1" + NON_BREAKING_DOT_SUBST + "$2";
    private static final String HYPHEN_REPL = "$1" + HYPHEN_SUBST + "$2";
    private static final String NEARBY_HYPHENS_REPL = "$1" + HYPHEN_SUBST + "$2" + HYPHEN_SUBST + "$3";

    private static final Pattern CURLY_QUOTE = Pattern.compile("’");
    private static final Pattern DECIMAL_COMMA_PATTERN = Pattern.compile("([\\d]),([\\d])", FLAGS);
    private static final Pattern DECIMAL_SPACE_PATTERN = Pattern.compile("(?<=^|[\\s(])\\d{1,3}( \\d{3})+(?:[\ue001\ue003]\\d+)?(?=\\D|$)", FLAGS);
    private static final Pattern DOTTED_NUMBERS_PATTERN = Pattern.compile("([\\d])\\.([\\d])", FLAGS);
    private static final Pattern COLON_NUMBERS_PATTERN = Pattern.compile("([\\d]):([\\d])", FLAGS);
    private static final Pattern DATE_PATTERN = Pattern.compile("([\\d]{2})\\.([\\d]{2})\\.([\\d]{4})|([\\d]{4})\\.([\\d]{2})\\.([\\d]{2})|([\\d]{4})-([\\d]{2})-([\\d]{2})", FLAGS);
    private static final Pattern DOTTED_ORDINALS_PATTERN = Pattern.compile("([\\d])\\.([aoªºᵃᵒ][sˢ]?)", FLAGS);
    private static final Pattern HYPHEN_PATTERN = Pattern.compile("([\\p{L}])-([\\p{L}\\d])", FLAGS);
    private static final Pattern NEARBY_HYPHENS_PATTERN = Pattern.compile("([\\p{L}])-([\\p{L}])-([\\p{L}])", FLAGS);

    // Contents of the regex character classes used to recognise a word.
    private static final String WORD_CHARS = "°\\^\\-\\p{L}\\d\\u0300-\\u036F\\u00A8\\u2070-\\u209F\ue001\ue003\ue004\ue002" + HYPHEN_SUBST;
    private static final String WORD_CHARS_LEFT_EDGE = "−@€£\\$¢¥¤";
    private static final String WORD_CHARS_RIGHT_EDGE = "€£\\$%‰‱ºªᵃᵒˢ";
    // Either a run of word characters with an optional edge symbol on each side,
    // or one single character that is not a word character.
    private static final Pattern WORD_PATTERN = Pattern.compile(
        "[" + WORD_CHARS_LEFT_EDGE + "]?[" + WORD_CHARS + "]+[" + WORD_CHARS_RIGHT_EDGE + "]?|[^" + WORD_CHARS + "]", FLAGS);

    // Hyphenated forms that stay in one piece even when the tagger does not know them.
    private static final List<String> UNSPLIT_HYPHENATED = Arrays.asList(
        "mers-cov", "mcgraw-hill", "sars-cov-2", "sars-cov", "ph-metre", "ph-metres",
        "anti-ivg", "anti-uv", "anti-vih", "al-qaïda");

    private final PortugueseTagger tagger = new PortugueseTagger();

    /**
     * Splits {@code str} into tokens.
     *
     * @param str the text to tokenize; must not be {@code null}
     * @return the tokens, after being passed through the inherited {@code joinEMailsAndUrls}
     */
    public List<String> tokenize(String str) {
        String text = str;
        if (text.contains(",")) {
            text = DECIMAL_COMMA_PATTERN.matcher(text).replaceAll(DECIMAL_COMMA_REPL);
        }
        // Dots only need protecting when there is one that is not the very last character.
        int firstDot = text.indexOf('.');
        if (firstDot >= 0 && firstDot < text.length() - 1) {
            text = protectDotsInDates(text);
            text = DOTTED_NUMBERS_PATTERN.matcher(text).replaceAll(DOTTED_NUMBERS_REPL);
            text = DOTTED_ORDINALS_PATTERN.matcher(text).replaceAll(DOTTED_ORDINALS_REPL);
        }
        text = protectSpacesInNumbers(text);
        if (text.contains(":")) {
            text = COLON_NUMBERS_PATTERN.matcher(text).replaceAll(COLON_NUMBERS_REPL);
        }
        if (text.contains("-")) {
            // The three-part pattern runs first so that "a-b-c" gets both hyphens protected.
            text = NEARBY_HYPHENS_PATTERN.matcher(text).replaceAll(NEARBY_HYPHENS_REPL);
            text = HYPHEN_PATTERN.matcher(text).replaceAll(HYPHEN_REPL);
        }

        List<String> tokens = new ArrayList<>();
        Matcher words = WORD_PATTERN.matcher(text);
        while (words.find()) {
            String piece = words.group();
            if (!tokens.isEmpty() && isVariationSelector(piece)) {
                // A lone variation selector is glued onto the token it follows.
                int last = tokens.size() - 1;
                tokens.set(last, tokens.get(last) + piece);
            } else {
                tokens.addAll(wordsToAdd(restorePlaceholders(piece)));
            }
        }
        return joinEMailsAndUrls(tokens);
    }

    /**
     * Replaces the dots of every date matched by {@link #DATE_PATTERN} with the non-breaking-dot
     * placeholder.
     *
     * <p>The substitution works on the whole match rather than on numbered groups: the pattern
     * has three alternatives with their own groups (1-3, 4-6, 7-9), and a group of an
     * alternative that did not take part in the match expands to nothing in a replacement
     * string, which would erase the digits of the date. Dates written with hyphens are left
     * unchanged, exactly as they are when the text contains no dot at all.
     */
    private static String protectDotsInDates(String text) {
        Matcher dates = DATE_PATTERN.matcher(text);
        if (!dates.find()) {
            return text;
        }
        StringBuffer out = new StringBuffer();
        do {
            String protectedDate = dates.group().replace('.', NON_BREAKING_DOT_SUBST);
            dates.appendReplacement(out, Matcher.quoteReplacement(protectedDate));
        } while (dates.find());
        dates.appendTail(out);
        return out.toString();
    }

    /**
     * Replaces the spaces inside every number matched by {@link #DECIMAL_SPACE_PATTERN}
     * (digit groups separated by single spaces) with the non-breaking-space placeholder.
     */
    private static String protectSpacesInNumbers(String text) {
        Matcher numbers = DECIMAL_SPACE_PATTERN.matcher(text);
        if (!numbers.find()) {
            return text;
        }
        StringBuffer out = new StringBuffer();
        do {
            String protectedNumber = numbers.group()
                .replace(' ', NON_BREAKING_SPACE_SUBST)
                .replace(NO_BREAK_SPACE, NON_BREAKING_SPACE_SUBST);
            numbers.appendReplacement(out, Matcher.quoteReplacement(protectedNumber));
        } while (numbers.find());
        numbers.appendTail(out);
        return out.toString();
    }

    /** Tells whether {@code piece} is exactly one char from the variation-selector range. */
    private static boolean isVariationSelector(String piece) {
        if (piece.length() != 1) {
            return false;
        }
        int codePoint = piece.codePointAt(0);
        return codePoint >= VARIATION_SELECTOR_FIRST && codePoint <= VARIATION_SELECTOR_LAST;
    }

    /** Turns every placeholder in {@code piece} back into the character it stands for. */
    private static String restorePlaceholders(String piece) {
        return piece
            .replace(DECIMAL_COMMA_SUBST, ',')
            .replace(NON_BREAKING_COLON_SUBST, ':')
            .replace(NON_BREAKING_SPACE_SUBST, ' ')
            .replace(NON_BREAKING_DOT_SUBST, '.')
            .replace(HYPHEN_SUBST, "-");
    }

    /**
     * Decides how one matched piece of text ends up in the token list.
     *
     * <ul>
     *   <li>an empty string yields no token;</li>
     *   <li>a currency expression (as judged by the inherited {@code isCurrencyExpression})
     *       yields whatever the inherited {@code splitCurrencyExpression} returns;</li>
     *   <li>a string without hyphen, a hyphenated string known to the tagger, or one of the
     *       listed exceptions is kept whole;</li>
     *   <li>anything else is split at its hyphens, each hyphen becoming a token of its own.</li>
     * </ul>
     *
     * @param str the piece of text, with placeholders already restored
     * @return the tokens to append, possibly empty
     */
    private List<String> wordsToAdd(String str) {
        List<String> words = new ArrayList<>();
        if (str.isEmpty()) {
            return words;
        }
        if (isCurrencyExpression(str)) {
            words.addAll(splitCurrencyExpression(str));
            return words;
        }
        // Evaluation order matters: the tagger is only consulted for hyphenated strings.
        if (!str.contains("-") || isKnownToTagger(str) || isUnsplitHyphenated(str)) {
            words.add(str);
            return words;
        }
        StringTokenizer parts = new StringTokenizer(str, "-", true);
        while (parts.hasMoreElements()) {
            words.add(parts.nextToken());
        }
        return words;
    }

    /** Asks the tagger whether {@code word}, with curly apostrophes straightened, is tagged. */
    private boolean isKnownToTagger(String word) {
        String normalized = CURLY_QUOTE.matcher(word).replaceAll("'");
        return this.tagger.tag(Arrays.asList(normalized)).get(0).isTagged();
    }

    /** Tells whether {@code word} equals, ignoring case, one of the hyphenated exceptions. */
    private static boolean isUnsplitHyphenated(String word) {
        for (String exception : UNSPLIT_HYPHENATED) {
            if (word.equalsIgnoreCase(exception)) {
                return true;
            }
        }
        return false;
    }
}
