package de.dfki.lt.tools.tokenizer;

import de.dfki.lt.tools.tokenizer.annotate.AnnotatedString;
import de.dfki.lt.tools.tokenizer.annotate.FastAnnotatedString;
import de.dfki.lt.tools.tokenizer.exceptions.ProcessingException;
import de.dfki.lt.tools.tokenizer.output.Outputter;
import de.dfki.lt.tools.tokenizer.output.Paragraph;
import de.dfki.lt.tools.tokenizer.regexp.Match;
import de.dfki.lt.tools.tokenizer.regexp.RegExp;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Paths;
import java.text.CharacterIterator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:de/dfki/lt/tools/tokenizer/JTok.class */
public class JTok {
    public static final String CLASS_ANNO = "class";
    public static final String BORDER_ANNO = "border";
    public static final String TU_BORDER = "tu";
    public static final String P_BORDER = "p";
    private static final Logger logger = LoggerFactory.getLogger(JTok.class);
    private static final String DEFAULT = "default";
    private Map<String, LanguageResource> langResources;

    public JTok() throws IOException {
        Properties properties = new Properties();
        properties.load(FileTools.openResourceFileAsStream(Paths.get("jtok/jtok.cfg", new String[0])));
        init(properties);
    }

    public JTok(Properties properties) {
        init(properties);
    }

    private void init(Properties properties) {
        if (properties.get(DEFAULT) == null) {
            logger.error("missing default language resources");
        }
        this.langResources = new HashMap();
        for (Map.Entry entry : properties.entrySet()) {
            String str = (String) entry.getKey();
            String str2 = (String) entry.getValue();
            logger.info(String.format("loading language resources for %s from %s", str, str2));
            this.langResources.put(str, new LanguageResource(str, str2));
        }
    }

    public LanguageResource getLanguageResource(String str) {
        LanguageResource languageResource = this.langResources.get(str);
        if (languageResource != null) {
            return languageResource;
        }
        logger.info(String.format("language %s not supported, using default configuration", str));
        return this.langResources.get(DEFAULT);
    }

    public AnnotatedString tokenize(String str, String str2) {
        LanguageResource languageResource = getLanguageResource(str2);
        FastAnnotatedString fastAnnotatedString = new FastAnnotatedString(str);
        identifyTokens(fastAnnotatedString, languageResource);
        identifyPunct(fastAnnotatedString, languageResource);
        identifyAbbrev(fastAnnotatedString, languageResource);
        identifyTus(fastAnnotatedString, languageResource);
        return fastAnnotatedString;
    }

    private void identifyTokens(AnnotatedString annotatedString, LanguageResource languageResource) {
        int i = 0;
        boolean z = false;
        String tagName = languageResource.getClassesRoot().getTagName();
        char first = annotatedString.first();
        while (true) {
            char c = first;
            if (c == 65535) {
                break;
            }
            if (Character.isWhitespace(c) || c == 160) {
                if (z) {
                    annotate(annotatedString, CLASS_ANNO, tagName, i, annotatedString.getIndex(), annotatedString.substring(i, annotatedString.getIndex()), languageResource);
                    z = false;
                }
            } else if (!z) {
                z = true;
                i = annotatedString.getIndex();
            }
            first = annotatedString.next();
        }
        if (z) {
            annotate(annotatedString, CLASS_ANNO, tagName, i, annotatedString.getIndex(), annotatedString.substring(i, annotatedString.getIndex()), languageResource);
        }
    }

    private void identifyPunct(AnnotatedString annotatedString, LanguageResource languageResource) {
        RegExp allPunctMatcher = languageResource.getAllPunctMatcher();
        RegExp internalMatcher = languageResource.getInternalMatcher();
        String tagName = languageResource.getClassesRoot().getTagName();
        char index = annotatedString.setIndex(0);
        if (annotatedString.getAnnotation(CLASS_ANNO) == null) {
            index = annotatedString.setIndex(annotatedString.findNextAnnotation(CLASS_ANNO));
        }
        while (index != 65535) {
            if (annotatedString.getAnnotation(CLASS_ANNO) == null) {
                index = annotatedString.setIndex(annotatedString.findNextAnnotation(CLASS_ANNO));
            } else if (((String) annotatedString.getAnnotation(CLASS_ANNO)) != tagName) {
                index = annotatedString.setIndex(annotatedString.findNextAnnotation(CLASS_ANNO));
            } else {
                int findNextAnnotation = annotatedString.findNextAnnotation(CLASS_ANNO);
                splitPunctuation(annotatedString, languageResource);
                if (((String) annotatedString.getAnnotation(CLASS_ANNO)) != tagName) {
                    index = annotatedString.setIndex(findNextAnnotation);
                } else {
                    splitClitics(annotatedString, languageResource);
                    String str = (String) annotatedString.getAnnotation(CLASS_ANNO);
                    if (str != tagName) {
                        index = annotatedString.setIndex(findNextAnnotation);
                    } else {
                        int index2 = annotatedString.getIndex();
                        String substring = annotatedString.substring(index2, annotatedString.getRunLimit(CLASS_ANNO));
                        List<Match> allMatches = allPunctMatcher.getAllMatches(substring);
                        if (allMatches.size() == 0) {
                            index = annotatedString.setIndex(findNextAnnotation);
                        } else {
                            int i = 0;
                            for (int i2 = 0; i2 < allMatches.size(); i2++) {
                                Match match = allMatches.get(i2);
                                if (i != match.getStartIndex()) {
                                    if (!internalMatcher.matches(match.getImage()) || !hasRightContextEnd(match, allMatches, substring, i2)) {
                                        annotate(annotatedString, CLASS_ANNO, str, index2 + i, index2 + match.getStartIndex(), substring.substring(i, match.getStartIndex()), languageResource);
                                        i = match.getStartIndex();
                                    }
                                }
                                annotatedString.annotate(CLASS_ANNO, identifyPunctClass(match, null, substring, languageResource), index2 + i, index2 + match.getEndIndex());
                                i = match.getEndIndex();
                            }
                            if (i != substring.length()) {
                                annotate(annotatedString, CLASS_ANNO, str, index2 + i, index2 + substring.length(), substring.substring(i), languageResource);
                            }
                            index = annotatedString.setIndex(findNextAnnotation);
                        }
                    }
                }
            }
        }
    }

    private void splitPunctuation(AnnotatedString annotatedString, LanguageResource languageResource) {
        RegExp allPunctMatcher = languageResource.getAllPunctMatcher();
        String tagName = languageResource.getClassesRoot().getTagName();
        int index = annotatedString.getIndex();
        int runLimit = annotatedString.getRunLimit(CLASS_ANNO);
        String substring = annotatedString.substring(index, runLimit);
        String str = (String) annotatedString.getAnnotation(CLASS_ANNO);
        Match starts = allPunctMatcher.starts(substring);
        while (true) {
            Match match = starts;
            if (match == null) {
                break;
            }
            annotatedString.annotate(CLASS_ANNO, identifyPunctClass(match, null, substring, languageResource), index + match.getStartIndex(), index + match.getEndIndex());
            index += match.getEndIndex();
            substring = annotatedString.substring(index, runLimit);
            annotatedString.setIndex(index);
            if (substring.length() > 0) {
                annotate(annotatedString, CLASS_ANNO, str, index, runLimit, substring, languageResource);
                str = (String) annotatedString.getAnnotation(CLASS_ANNO);
                if (str != tagName) {
                    break;
                } else {
                    starts = allPunctMatcher.starts(substring);
                }
            } else {
                starts = null;
            }
        }
        Match ends = allPunctMatcher.ends(substring);
        while (true) {
            Match match2 = ends;
            if (match2 == null) {
                return;
            }
            annotatedString.annotate(CLASS_ANNO, identifyPunctClass(match2, null, substring, languageResource), index + match2.getStartIndex(), index + match2.getEndIndex());
            int startIndex = index + match2.getStartIndex();
            substring = annotatedString.substring(index, startIndex);
            if (substring.length() > 0) {
                annotate(annotatedString, CLASS_ANNO, str, index, startIndex, substring, languageResource);
                str = (String) annotatedString.getAnnotation(CLASS_ANNO);
                if (str != tagName) {
                    return;
                } else {
                    ends = allPunctMatcher.ends(substring);
                }
            } else {
                ends = null;
            }
        }
    }

    private void splitClitics(AnnotatedString annotatedString, LanguageResource languageResource) {
        RegExp procliticsMatcher = languageResource.getProcliticsMatcher();
        RegExp encliticsMatcher = languageResource.getEncliticsMatcher();
        String tagName = languageResource.getClassesRoot().getTagName();
        int index = annotatedString.getIndex();
        int runLimit = annotatedString.getRunLimit(CLASS_ANNO);
        String substring = annotatedString.substring(index, runLimit);
        String str = (String) annotatedString.getAnnotation(CLASS_ANNO);
        Match starts = procliticsMatcher.starts(substring);
        while (true) {
            Match match = starts;
            if (match == null) {
                break;
            }
            annotatedString.annotate(CLASS_ANNO, identifyClass(match.getImage(), procliticsMatcher, languageResource.getClitDescr()), index + match.getStartIndex(), index + match.getEndIndex());
            index += match.getEndIndex();
            substring = annotatedString.substring(index, runLimit);
            annotatedString.setIndex(index);
            if (substring.length() > 0) {
                annotate(annotatedString, CLASS_ANNO, str, index, runLimit, substring, languageResource);
                str = (String) annotatedString.getAnnotation(CLASS_ANNO);
                if (str != tagName) {
                    break;
                } else {
                    starts = procliticsMatcher.starts(substring);
                }
            } else {
                starts = null;
            }
        }
        Match ends = encliticsMatcher.ends(substring);
        while (true) {
            Match match2 = ends;
            if (match2 == null) {
                return;
            }
            annotatedString.annotate(CLASS_ANNO, identifyClass(match2.getImage(), encliticsMatcher, languageResource.getClitDescr()), index + match2.getStartIndex(), index + match2.getEndIndex());
            int startIndex = index + match2.getStartIndex();
            String substring2 = annotatedString.substring(index, startIndex);
            if (substring2.length() > 0) {
                annotate(annotatedString, CLASS_ANNO, str, index, startIndex, substring2, languageResource);
                str = (String) annotatedString.getAnnotation(CLASS_ANNO);
                if (str != tagName) {
                    return;
                } else {
                    ends = encliticsMatcher.ends(substring2);
                }
            } else {
                ends = null;
            }
        }
    }

    private boolean hasRightContextEnd(Match match, List<Match> list, String str, int i) {
        return i < list.size() - 1 ? list.get(i + 1).getStartIndex() != match.getEndIndex() : match.getEndIndex() != str.length();
    }

    private void annotate(AnnotatedString annotatedString, String str, Object obj, int i, int i2, String str2, LanguageResource languageResource) {
        RegExp allClassesMatcher = languageResource.getAllClassesMatcher();
        if (allClassesMatcher.matches(str2)) {
            annotatedString.annotate(str, identifyClass(str2, allClassesMatcher, languageResource.getClassesDescr()), i, i2);
        } else {
            annotatedString.annotate(str, obj, i, i2);
        }
    }

    private String identifyPunctClass(Match match, RegExp regExp, String str, LanguageResource languageResource) {
        String identifyClass = identifyClass(match.getImage(), regExp, languageResource.getPunctDescr());
        if (languageResource.isAncestor("OPENCLOSE_PUNCT", identifyClass)) {
            int endIndex = match.getEndIndex();
            if (endIndex >= str.length() || !Character.isLetter(str.charAt(endIndex))) {
                identifyClass = PunctDescription.CLOSE_PUNCT;
            } else {
                int startIndex = match.getStartIndex() - 1;
                if (startIndex < 0 || !Character.isLetter(str.charAt(startIndex))) {
                    identifyClass = PunctDescription.OPEN_PUNCT;
                }
            }
        }
        return identifyClass;
    }

    private void identifyAbbrev(AnnotatedString annotatedString, LanguageResource languageResource) {
        RegExp allAbbrevMatcher = languageResource.getAllAbbrevMatcher();
        Map<String, Set<String>> abbrevLists = languageResource.getAbbrevLists();
        char index = annotatedString.setIndex(0);
        if (annotatedString.getAnnotation(CLASS_ANNO) == null) {
            index = annotatedString.setIndex(annotatedString.findNextAnnotation(CLASS_ANNO));
        }
        while (index != 65535) {
            int runLimit = annotatedString.getRunLimit(CLASS_ANNO);
            int index2 = annotatedString.getIndex();
            index = annotatedString.setIndex(annotatedString.findNextAnnotation(CLASS_ANNO));
            if (index == '.' && runLimit == annotatedString.getIndex()) {
                int i = runLimit + 1;
                String substring = annotatedString.substring(index2, i);
                int lastIndexOf = substring.lastIndexOf("-");
                if (lastIndexOf != -1) {
                    String substring2 = substring.substring(lastIndexOf + 1);
                    if (substring2.matches("[^0-9]{2,}")) {
                        substring = substring2;
                    }
                }
                boolean z = false;
                Iterator<Map.Entry<String, Set<String>>> it = abbrevLists.entrySet().iterator();
                while (true) {
                    if (!it.hasNext()) {
                        break;
                    }
                    Map.Entry<String, Set<String>> next = it.next();
                    String key = next.getKey();
                    if (next.getValue().contains(substring)) {
                        annotatedString.annotate(CLASS_ANNO, key, index2, i);
                        z = true;
                        break;
                    }
                }
                if (!z && allAbbrevMatcher.matches(substring)) {
                    annotatedString.annotate(CLASS_ANNO, identifyClass(substring, allAbbrevMatcher, languageResource.getAbbrevDescr()), index2, i);
                }
            }
        }
    }

    private void identifyTus(AnnotatedString annotatedString, LanguageResource languageResource) {
        RegExp internalTuMatcher = languageResource.getInternalTuMatcher();
        boolean z = false;
        boolean z2 = false;
        char index = annotatedString.setIndex(0);
        while (index != 65535) {
            int runStart = annotatedString.getRunStart(CLASS_ANNO);
            int runLimit = annotatedString.getRunLimit(CLASS_ANNO);
            if (annotatedString.getAnnotation(CLASS_ANNO) != null) {
                if (z) {
                    if (!languageResource.isAncestor(PunctDescription.TERM_PUNCT, (String) annotatedString.getAnnotation(CLASS_ANNO)) && !languageResource.isAncestor(PunctDescription.TERM_PUNCT_P, (String) annotatedString.getAnnotation(CLASS_ANNO)) && !languageResource.isAncestor(PunctDescription.CLOSE_PUNCT, (String) annotatedString.getAnnotation(CLASS_ANNO)) && !languageResource.isAncestor(PunctDescription.CLOSE_BRACKET, (String) annotatedString.getAnnotation(CLASS_ANNO))) {
                        if (Character.isLowerCase(index) || internalTuMatcher.matches(annotatedString.substring(annotatedString.getIndex(), annotatedString.getIndex() + 1))) {
                            z = false;
                        } else {
                            annotatedString.annotate(BORDER_ANNO, "tu", runStart, runStart + 1);
                            z = false;
                        }
                    }
                } else if (z2) {
                    if (languageResource.getNonCapTerms().contains(annotatedString.substring(runStart, runLimit)) || languageResource.isAncestor(PunctDescription.OPEN_PUNCT, (String) annotatedString.getAnnotation(CLASS_ANNO))) {
                        annotatedString.annotate(BORDER_ANNO, "tu", runStart, runStart + 1);
                    }
                    z2 = false;
                } else if (languageResource.isAncestor(PunctDescription.TERM_PUNCT, (String) annotatedString.getAnnotation(CLASS_ANNO)) || languageResource.isAncestor(PunctDescription.TERM_PUNCT_P, (String) annotatedString.getAnnotation(CLASS_ANNO))) {
                    z = true;
                } else if (languageResource.isAncestor(AbbrevDescription.B_ABBREVIATION, (String) annotatedString.getAnnotation(CLASS_ANNO))) {
                    z2 = true;
                }
                index = annotatedString.setIndex(runLimit);
            } else if (isParagraphChange(annotatedString.substring(runStart, runLimit))) {
                z = false;
                z2 = false;
                index = annotatedString.setIndex(runLimit);
                if (index != 65535) {
                    annotatedString.annotate(BORDER_ANNO, "p", annotatedString.getIndex(), annotatedString.getIndex() + 1);
                }
            } else {
                index = annotatedString.setIndex(runLimit);
            }
        }
    }

    private boolean isParagraphChange(String str) {
        int length = str.length();
        for (int i = 0; i < length; i++) {
            char charAt = str.charAt(i);
            if ('\n' == charAt || '\r' == charAt) {
                for (int i2 = i + 1; i2 < length; i2++) {
                    if (charAt == str.charAt(i2)) {
                        return true;
                    }
                }
            }
        }
        return false;
    }

    private String identifyClass(String str, RegExp regExp, Description description) {
        String str2;
        if (regExp != null && (str2 = description.getRegExpMap().get(regExp)) != null) {
            return str2;
        }
        for (Map.Entry<String, RegExp> entry : description.getDefinitionsMap().entrySet()) {
            String key = entry.getKey();
            if (entry.getValue().matches(str)) {
                return key;
            }
        }
        throw new ProcessingException(String.format("could not find class for %s", str));
    }

    public static void main(String[] strArr) {
        if (strArr.length != 2 && strArr.length != 3) {
            System.out.format("This method needs two arguments:%n- a file name for the document to tokenize%n- the language of the document%n- an optional encoding to use (default is UTF-8)", new Object[0]);
            System.exit(1);
        }
        String str = null;
        try {
            str = FileTools.readFileAsString(new File(strArr[0]), strArr.length == 3 ? strArr[2] : "UTF-8");
        } catch (IOException e) {
            e.printStackTrace();
            System.exit(1);
        }
        try {
            Iterator<Paragraph> it = Outputter.createParagraphs(new JTok().tokenize(str, strArr[1])).iterator();
            while (it.hasNext()) {
                System.out.println(it.next());
            }
        } catch (IOException e2) {
            logger.error(e2.getLocalizedMessage(), e2);
        }
    }
}
