/*
 * Decompiled with CFR 0.152.
 */
package org.languagetool.dev.errorcorpus;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.StringTokenizer;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
import org.languagetool.dev.errorcorpus.Error;
import org.languagetool.dev.errorcorpus.ErrorCorpus;
import org.languagetool.dev.errorcorpus.ErrorSentence;
import org.languagetool.markup.AnnotatedText;
import org.languagetool.markup.AnnotatedTextBuilder;

public class PedlerCorpus
implements ErrorCorpus {
    private static final String NORMALIZE_REGEX = "\\s*<ERR targ\\s*=\\s*([^>]*?)\\s*>\\s*(.*?)\\s*</ERR>\\s*";
    private final List<String> lines = new ArrayList<String>();
    private int pos;

    public PedlerCorpus(File dir) throws IOException {
        File[] files = dir.listFiles();
        if (files == null) {
            throw new RuntimeException("Directory not found or is not a directory: " + dir);
        }
        for (File file : files) {
            if (!file.getName().endsWith(".txt")) {
                System.out.println("Ignoring " + file + ", does not match *.txt");
                continue;
            }
            try (FileInputStream fis = new FileInputStream(file);){
                this.lines.addAll(IOUtils.readLines((InputStream)fis));
            }
        }
    }

    @Override
    public Iterator<ErrorSentence> iterator() {
        return new Iterator<ErrorSentence>(){

            @Override
            public boolean hasNext() {
                return PedlerCorpus.this.pos < PedlerCorpus.this.lines.size();
            }

            @Override
            public ErrorSentence next() {
                String line = PedlerCorpus.this.lines.get(PedlerCorpus.this.pos++);
                ErrorSentence sentence = PedlerCorpus.this.getIncorrectSentence(line);
                return sentence;
            }

            @Override
            public void remove() {
                throw new UnsupportedOperationException();
            }
        };
    }

    private ErrorSentence getIncorrectSentence(String line) {
        String normalized = line.replaceAll(NORMALIZE_REGEX, " <ERR targ=$1>$2</ERR> ").replaceAll("\\s+", " ").trim();
        ArrayList<Error> errors = new ArrayList<Error>();
        int startPos = 0;
        while (normalized.indexOf("<ERR targ=", startPos) != -1) {
            int startTagStart = normalized.indexOf("<ERR targ=", startPos);
            int startTagEnd = normalized.indexOf(">", startTagStart);
            int endTagStart = normalized.indexOf("</ERR>", startTagStart);
            int correctionEnd = normalized.indexOf(">", startTagStart);
            String correction = normalized.substring(startTagStart + "<ERR targ=".length(), correctionEnd);
            errors.add(new Error(startTagEnd + 1, endTagStart, correction));
            startPos = startTagStart + 1;
        }
        return new ErrorSentence(normalized, this.makeAnnotatedText(normalized), errors);
    }

    private AnnotatedText makeAnnotatedText(String pseudoXml) {
        AnnotatedTextBuilder builder = new AnnotatedTextBuilder();
        StringTokenizer tokenizer = new StringTokenizer(pseudoXml, "<>", true);
        boolean inMarkup = false;
        while (tokenizer.hasMoreTokens()) {
            String part = tokenizer.nextToken();
            if (part.startsWith("<")) {
                builder.addMarkup(part);
                inMarkup = true;
                continue;
            }
            if (part.startsWith(">")) {
                inMarkup = false;
                builder.addMarkup(part);
                continue;
            }
            if (inMarkup) {
                builder.addMarkup(part);
                continue;
            }
            builder.addText(part);
        }
        return builder.build();
    }
}

