/*
 * Decompiled with CFR 0.152.
 */
package org.apache.lucene.benchmark.utils;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Splits Reuters {@code .sgm} files into one plain-text file per article.
 *
 * <p>Every {@code .sgm} file found directly inside the input directory is read line by line.
 * Lines are accumulated until a line containing {@code </REUTERS} is seen; at that point the
 * contents of the {@code <TITLE>}, {@code <DATE>} and {@code <BODY>} elements found in the
 * accumulated text are written, in order of appearance, to
 * {@code <sgm file name>-<n>.txt} in the output directory, where {@code n} counts articles
 * within that input file starting at 0.
 */
public class ExtractReuters {
    private static final String LINE_SEPARATOR = System.lineSeparator();

    /** Matches the content of a single TITLE, DATE or BODY element; exactly one group is non-null per match. */
    static final Pattern EXTRACTION_PATTERN = Pattern.compile("<TITLE>(.*?)</TITLE>|<DATE>(.*?)</DATE>|<BODY>(.*?)</BODY>");

    // "&amp;" is deliberately decoded LAST. Decoding it first would turn "&amp;lt;" into "&lt;"
    // and then into "<", i.e. the text would be unescaped twice.
    private static final String[] META_CHARS = new String[]{"<", ">", "\"", "'", "&"};
    private static final String[] META_CHARS_SERIALIZATIONS = new String[]{"&lt;", "&gt;", "&quot;", "&apos;", "&amp;"};

    private final File reutersDir;
    private final File outputDir;

    /**
     * Creates an extractor and deletes the files currently present in {@code outputDir}.
     *
     * @param reutersDir directory that is scanned for {@code .sgm} files by {@link #extract()}
     * @param outputDir  directory the extracted {@code .txt} files are written to; if it cannot be
     *                   listed (for example because it does not exist) nothing is deleted
     */
    public ExtractReuters(File reutersDir, File outputDir) {
        this.reutersDir = reutersDir;
        this.outputDir = outputDir;
        System.out.println("Deleting all files in " + outputDir);
        // File.listFiles() returns null when the path is not a directory or an I/O error occurs.
        File[] existing = outputDir.listFiles();
        if (existing != null) {
            for (File f : existing) {
                f.delete();
            }
        }
    }

    /**
     * Extracts every file whose name ends with {@code .sgm} in the input directory.
     * Prints a message to standard error when no such file is found.
     */
    public void extract() {
        File[] sgmFiles = this.reutersDir.listFiles(new FileFilter(){

            @Override
            public boolean accept(File file) {
                return file.getName().endsWith(".sgm");
            }
        });
        if (sgmFiles != null && sgmFiles.length > 0) {
            for (File sgmFile : sgmFiles) {
                this.extractFile(sgmFile);
            }
        } else {
            System.err.println("No .sgm files in " + this.reutersDir);
        }
    }

    /**
     * Splits one {@code .sgm} file into per-article text files in the output directory.
     *
     * <p>Input lines are joined with a single space. The line that contains {@code </REUTERS}
     * only acts as the article terminator and is not itself searched for elements. Text after
     * the last terminator is discarded.
     *
     * @param sgmFile the file to split, read as UTF-8
     * @throws RuntimeException wrapping the {@link IOException} if reading or writing fails
     */
    protected void extractFile(File sgmFile) {
        // try-with-resources guarantees the reader is closed even when a read or write fails.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(sgmFile), StandardCharsets.UTF_8))) {
            StringBuilder buffer = new StringBuilder(1024);
            StringBuilder outBuffer = new StringBuilder(1024);
            String line;
            int docNumber = 0;
            while ((line = reader.readLine()) != null) {
                if (line.indexOf("</REUTERS") == -1) {
                    buffer.append(line).append(' ');
                    continue;
                }
                Matcher matcher = EXTRACTION_PATTERN.matcher(buffer);
                while (matcher.find()) {
                    for (int i = 1; i <= matcher.groupCount(); ++i) {
                        String content = matcher.group(i);
                        if (content != null) {
                            outBuffer.append(content);
                        }
                    }
                    outBuffer.append(LINE_SEPARATOR).append(LINE_SEPARATOR);
                }
                File outFile = new File(this.outputDir, sgmFile.getName() + "-" + docNumber++ + ".txt");
                writeDocument(outFile, unescape(outBuffer.toString()));
                outBuffer.setLength(0);
                buffer.setLength(0);
            }
        }
        catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /** Replaces the five serialized meta characters with their literal form, each exactly once. */
    private static String unescape(String text) {
        String result = text;
        for (int i = 0; i < META_CHARS_SERIALIZATIONS.length; ++i) {
            // Literal replacement: no regular expression semantics are wanted here.
            result = result.replace(META_CHARS_SERIALIZATIONS[i], META_CHARS[i]);
        }
        return result;
    }

    /** Writes {@code content} to {@code outFile} as UTF-8, closing the file even on failure. */
    private static void writeDocument(File outFile, String content) throws IOException {
        try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(outFile), StandardCharsets.UTF_8)) {
            writer.write(content);
        }
    }

    /**
     * Command line entry point. Extracts into {@code <Output Path>-tmp} and renames that
     * directory to {@code <Output Path>} once extraction has finished.
     *
     * @param args {@code <Path to Reuters SGM files> <Output Path>}
     */
    public static void main(String[] args) {
        if (args.length != 2) {
            ExtractReuters.usage("Wrong number of arguments (" + args.length + ")");
            return;
        }
        File reutersDir = new File(args[0]);
        if (!reutersDir.exists()) {
            ExtractReuters.usage("Cannot find Path to Reuters SGM files (" + reutersDir + ")");
            return;
        }
        File finalDir = new File(args[1]);
        File outputDir = new File(finalDir.getAbsolutePath() + "-tmp");
        // mkdirs() returns false both on failure and when the directory already exists.
        if (!outputDir.mkdirs() && !outputDir.isDirectory()) {
            System.err.println("Cannot create output directory " + outputDir);
            return;
        }
        ExtractReuters extractor = new ExtractReuters(reutersDir, outputDir);
        extractor.extract();
        if (!outputDir.renameTo(finalDir)) {
            System.err.println("Cannot rename " + outputDir + " to " + finalDir + "; extracted files remain in " + outputDir);
        }
    }

    /** Prints {@code msg} followed by the command line synopsis to standard error. */
    private static void usage(String msg) {
        System.err.println("Usage: " + msg + " :: java -cp <...> org.apache.lucene.benchmark.utils.ExtractReuters <Path to Reuters SGM files> <Output Path>");
    }
}

