package org.apache.tika.parser.ocr;

import java.awt.Image;
import java.awt.image.BufferedImage;
import java.awt.image.ImageObserver;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import javax.imageio.ImageIO;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.apache.fontbox.ttf.HeaderTable;
import org.apache.poi.openxml4j.opc.ContentTypes;
import org.apache.tika.config.Field;
import org.apache.tika.config.Initializable;
import org.apache.tika.config.InitializableProblemHandler;
import org.apache.tika.config.Param;
import org.apache.tika.config.TikaTaskTimeout;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractExternalProcessParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.StringUtils;
import org.apache.tika.utils.XMLReaderUtils;
import org.apache.xml.serialize.HTMLSerializer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.xml.sax.helpers.DefaultHandler;

/* loaded from: input_file:tika-parsers-standard-package-2.4.1.jar:org/apache/tika/parser/ocr/TesseractOCRParser.class */
public class TesseractOCRParser extends AbstractExternalProcessParser implements Initializable {
    /** Prefix shared by the metadata property names declared below. */
    public static final String TESS_META = "tess:";
    /** Name of the environment variable that {@code setEnv} puts on the tesseract process. */
    private static final String TESSDATA_PREFIX = "TESSDATA_PREFIX";
    private static final String OCR = "ocr-";
    private static final long serialVersionUID = -8167538283213097265L;
    /**
     * Language codes that language validation compares requests against. While this set is
     * empty, the availability check in {@code validateLangString} is skipped.
     */
    private final Set<String> langs = new HashSet<>();
    /** Parser-level configuration; the {@code @Field} setters of this class write into it. */
    private final TesseractOCRConfig defaultConfig = new TesseractOCRConfig();
    /** Directory prefix prepended to the tesseract program name; empty means none is configured. */
    private String tesseractPath = "";
    /** Value exported as {@code TESSDATA_PREFIX} when it is not blank. */
    private String tessdataPath = "";
    /** Directory prefix prepended to the ImageMagick program name; empty means none is configured. */
    private String imageMagickPath = "";
    /** When true, {@code initialize} preloads the language list and validates the default language. */
    private boolean preloadLangs = false;
    /** Result of {@link #hasTesseract()} as captured by {@code initialize}. */
    private boolean hasTesseract;
    /** Result of {@code hasImageMagick()} as captured by {@code initialize}. */
    private boolean hasImageMagick;
    /** Created in {@code initialize}; used to preprocess images before OCR. */
    private ImagePreprocessor imagePreprocessor;
    public static final Property IMAGE_ROTATION = Property.externalRealSeq("tess:rotation");
    public static final Property IMAGE_MAGICK = Property.externalBooleanSeq("tess:image_magick_processed");
    // The PSM0_* properties are the ones written by extractOSD (page segmentation mode "0").
    public static final Property PSM0_PAGE_NUMBER = Property.externalInteger("tess:page_number");
    public static final Property PSM0_ORIENTATION = Property.externalInteger("tess:orientation");
    public static final Property PSM0_ROTATE = Property.externalInteger("tess:rotate");
    public static final Property PSM0_ORIENTATION_CONFIDENCE = Property.externalReal("tess:orientation_confidence");
    public static final Property PSM0_SCRIPT = Property.externalText("tess:script");
    public static final Property PSM0_SCRIPT_CONFIDENCE = Property.externalReal("tess:script_confidence");
    private static final Logger LOG = LoggerFactory.getLogger(TesseractOCRParser.class);
    /** Monitor that {@code hasWarned()} synchronizes on. */
    private static final Object[] LOCK = new Object[0];
    /** Media types reported by {@code getSupportedTypes} when tesseract is usable and OCR is not skipped. */
    private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
            MediaType.image("ocr-png"),
            MediaType.image("ocr-jpeg"),
            MediaType.image("ocr-tiff"),
            MediaType.image("ocr-bmp"),
            MediaType.image("ocr-gif"),
            MediaType.image("jp2"),
            MediaType.image("jpx"),
            MediaType.image("x-portable-pixmap"),
            MediaType.image("ocr-jp2"),
            MediaType.image("ocr-jpx"),
            MediaType.image("ocr-x-portable-pixmap"))));
    /** Set by {@code warn()} so the informational message is not logged on every parse. */
    private static volatile boolean HAS_WARNED = false;

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:tika-parsers-standard-package-2.4.1.jar:org/apache/tika/parser/ocr/TesseractOCRParser$HOCRPassThroughHandler.class */
    public static class HOCRPassThroughHandler extends DefaultHandler {
        public static final Set<String> IGNORE = unmodifiableSet("html", HeaderTable.TAG, "title", "meta", "body");
        private final ContentHandler xhtml;

        public HOCRPassThroughHandler(ContentHandler contentHandler) {
            this.xhtml = contentHandler;
        }

        private static Set<String> unmodifiableSet(String... strArr) {
            return Collections.unmodifiableSet(new HashSet(Arrays.asList(strArr)));
        }

        @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
        public void startElement(String str, String str2, String str3, Attributes attributes) throws SAXException {
            if (IGNORE.contains(str3)) {
                return;
            }
            this.xhtml.startElement(str, str2, str3, attributes);
        }

        @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
        public void endElement(String str, String str2, String str3) throws SAXException {
            if (IGNORE.contains(str3)) {
                return;
            }
            this.xhtml.endElement(str, str2, str3);
        }

        @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
        public void characters(char[] cArr, int i, int i2) throws SAXException {
            this.xhtml.characters(cArr, i, i2);
        }
    }

    /**
     * Returns the name of the ImageMagick executable for the current operating system.
     *
     * @return {@code "magick"} when the {@code os.name} system property starts with
     *         {@code "Windows"}, otherwise {@code "convert"}
     */
    public static String getImageMagickProg() {
        final String osName = System.getProperty("os.name");
        if (osName.startsWith("Windows")) {
            return "magick";
        }
        return "convert";
    }

    /**
     * Returns the name of the tesseract executable for the current operating system.
     *
     * @return {@code "tesseract.exe"} when the {@code os.name} system property starts with
     *         {@code "Windows"}, otherwise {@code "tesseract"}
     */
    public static String getTesseractProg() {
        final String osName = System.getProperty("os.name");
        if (osName.startsWith("Windows")) {
            return "tesseract.exe";
        }
        return "tesseract";
    }

    /**
     * Reports the media types this parser handles for the given context.
     *
     * @param parseContext context that may carry a per-call {@link TesseractOCRConfig}
     * @return an empty set when tesseract was not detected or the per-call config asks to skip
     *         OCR; otherwise the fixed set of supported image types
     */
    public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
        TesseractOCRConfig userConfig = (TesseractOCRConfig) parseContext.get(TesseractOCRConfig.class);
        if (!this.hasTesseract) {
            return Collections.emptySet();
        }
        if (userConfig != null && userConfig.isSkipOcr()) {
            return Collections.emptySet();
        }
        return SUPPORTED_TYPES;
    }

    /**
     * Sets {@code TESSDATA_PREFIX} in the environment of the process about to be started.
     * A non-blank tessdata path wins; otherwise a non-blank tesseract path with
     * {@code "tessdata"} appended is used; if both are blank the environment is left alone.
     *
     * @param processBuilder builder whose environment map is updated in place
     */
    private void setEnv(ProcessBuilder processBuilder) {
        Map<String, String> env = processBuilder.environment();
        final String prefix;
        if (!StringUtils.isBlank(getTessdataPath())) {
            prefix = getTessdataPath();
        } else if (!StringUtils.isBlank(getTesseractPath())) {
            prefix = getTesseractPath() + "tessdata";
        } else {
            return;
        }
        env.put(TESSDATA_PREFIX, prefix);
    }

    /**
     * Checks whether the tesseract executable can be run from the configured location.
     *
     * @return the result of {@code ExternalParser.check} for the configured path plus program name
     * @throws TikaConfigException if a tesseract path is configured but is not an existing directory
     */
    public boolean hasTesseract() throws TikaConfigException {
        boolean pathConfigured = !StringUtils.isBlank(this.tesseractPath);
        if (pathConfigured && !Files.isDirectory(Paths.get(this.tesseractPath))) {
            throw new TikaConfigException("tesseractPath (" + this.tesseractPath + ") doesn't point to an existing directory");
        }
        String[] checkCmd = {getTesseractPath() + getTesseractProg()};
        boolean available = ExternalParser.check(checkCmd);
        LOG.debug("hasTesseract (path: " + Arrays.toString(checkCmd) + "): " + available);
        return available;
    }

    /**
     * Checks whether the ImageMagick executable can be run from the configured location.
     *
     * @return the result of {@code ExternalParser.check} for the configured path plus program name
     * @throws TikaConfigException if an ImageMagick path is configured but is not an existing directory
     */
    boolean hasImageMagick() throws TikaConfigException {
        boolean pathConfigured = !StringUtils.isBlank(this.imageMagickPath);
        if (pathConfigured && !Files.isDirectory(Paths.get(this.imageMagickPath))) {
            throw new TikaConfigException("imageMagickPath (" + this.imageMagickPath + ") doesn't point to an existing directory");
        }
        String commandLine = this.imageMagickPath + getImageMagickProg();
        boolean available = ExternalParser.check(new String[] {commandLine});
        if (available) {
            return true;
        }
        LOG.debug("ImageMagick does not appear to be installed (commandline: " + commandLine + ")");
        return false;
    }

    /**
     * Runs OCR on an in-memory AWT image by rendering it to a temporary PNG file and
     * delegating to {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)}.
     *
     * <p>The image is painted onto an RGB {@link BufferedImage} before it is written; the
     * previous code allocated the buffer but never drew the image into it, so a blank
     * picture was handed to tesseract.
     *
     * @param image image to recognise; its width and height must already be known
     * @param contentHandler receives the SAX events produced by the stream-based parse
     * @param metadata passed through to the stream-based parse
     * @param parseContext passed through to the stream-based parse
     * @throws IOException if the temporary PNG cannot be written or read
     * @throws SAXException if the content handler fails
     * @throws TikaException if the image dimensions are not available, or OCR fails
     */
    public void parse(Image image, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) throws IOException, SAXException, TikaException {
        int width = image.getWidth(null);
        int height = image.getHeight(null);
        // getWidth/getHeight return -1 while the size is unknown; BufferedImage would reject
        // that with an IllegalArgumentException that says nothing about the cause.
        if (width <= 0 || height <= 0) {
            throw new TikaException("Image dimensions are not available (width=" + width + ", height=" + height + ")");
        }
        BufferedImage rendered = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB);
        java.awt.Graphics2D graphics = rendered.createGraphics();
        try {
            graphics.drawImage(image, 0, 0, null);
        } finally {
            graphics.dispose();
        }
        try (TemporaryResources temporaryResources = new TemporaryResources()) {
            File pngFile = temporaryResources.createTemporaryFile();
            try (FileOutputStream pngOut = new FileOutputStream(pngFile)) {
                ImageIO.write(rendered, "png", pngOut);
            }
            try (TikaInputStream tikaInputStream = TikaInputStream.get(pngFile)) {
                parse((InputStream) tikaInputStream, contentHandler, metadata, parseContext);
            }
        }
    }

    /**
     * Runs OCR on an image stream and writes the result to {@code contentHandler} as XHTML.
     * Does nothing when tesseract was not detected or the effective configuration asks to
     * skip OCR.
     *
     * @param inputStream image data to recognise
     * @param contentHandler handler wrapped in an {@link XHTMLContentHandler} for the output
     * @param metadata metadata passed to the XHTML handler and to the OCR step
     * @param parseContext may carry a {@link TesseractOCRConfig} that is merged over the default
     * @throws IOException on temporary-file or stream failures
     * @throws SAXException if the content handler fails
     * @throws TikaException if the OCR step fails
     */
    public void parse(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) throws IOException, SAXException, TikaException {
        TesseractOCRConfig userConfig = (TesseractOCRConfig) parseContext.get(TesseractOCRConfig.class);
        TesseractOCRConfig effectiveConfig = userConfig == null
                ? this.defaultConfig
                : this.defaultConfig.cloneAndUpdate(userConfig);
        if (!this.hasTesseract) {
            return;
        }
        if (effectiveConfig != null && effectiveConfig.isSkipOcr()) {
            return;
        }
        try (TemporaryResources temporaryResources = new TemporaryResources()) {
            TikaInputStream tikaInputStream = TikaInputStream.get(inputStream, temporaryResources);
            // Result unused; presumably this forces the stream onto disk so that the private
            // parse can hand a file to tesseract -- confirm against TikaInputStream.
            tikaInputStream.getPath();
            // Base name passed to tesseract; the private parse appends the real extension.
            File ocrOutputBase = temporaryResources.createTemporaryFile();
            XHTMLContentHandler xhtml = new XHTMLContentHandler(contentHandler, metadata);
            xhtml.startDocument();
            parse(tikaInputStream, ocrOutputBase, xhtml, metadata, parseContext, effectiveConfig);
            xhtml.endDocument();
        }
    }

    /* JADX WARN: Multi-variable type inference failed */
    /* JADX WARN: Type inference failed for: r0v59 */
    /* JADX WARN: Type inference failed for: r0v60, types: [java.lang.Throwable] */
    /* JADX WARN: Type inference failed for: r19v0, types: [java.lang.Throwable, java.io.InputStream] */
    /* JADX WARN: Type inference failed for: r19v1 */
    /* JADX WARN: Type inference failed for: r19v2 */
    /* JADX WARN: Type inference failed for: r20v3, types: [java.lang.Throwable] */
    private void parse(TikaInputStream tikaInputStream, File file, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext, TesseractOCRConfig tesseractOCRConfig) throws IOException, SAXException, TikaException {
        warnOnFirstParse();
        validateLangString(tesseractOCRConfig.getLanguage());
        File file2 = null;
        try {
            Path path = tikaInputStream.getPath();
            long length = tikaInputStream.getLength();
            if (length >= tesseractOCRConfig.getMinFileSizeToOcr() && length <= tesseractOCRConfig.getMaxFileSizeToOcr()) {
                if (!tesseractOCRConfig.isEnableImagePreprocessing() && !tesseractOCRConfig.isApplyRotation()) {
                    doOCR(path.toFile(), file, tesseractOCRConfig, parseContext);
                } else if (this.hasImageMagick) {
                    TemporaryResources temporaryResources = new TemporaryResources();
                    ?? r19 = 0;
                    try {
                        try {
                            Path createTempFile = temporaryResources.createTempFile();
                            Files.copy(path, createTempFile, StandardCopyOption.REPLACE_EXISTING);
                            this.imagePreprocessor.process(createTempFile, createTempFile, metadata, tesseractOCRConfig);
                            doOCR(createTempFile.toFile(), file, tesseractOCRConfig, parseContext);
                            Path path2 = createTempFile;
                            if (temporaryResources != null) {
                                if (0 != 0) {
                                    try {
                                        temporaryResources.close();
                                        path2 = createTempFile;
                                    } catch (Throwable th) {
                                        r19.addSuppressed(th);
                                        path2 = th;
                                    }
                                } else {
                                    temporaryResources.close();
                                    path2 = createTempFile;
                                }
                            }
                        } catch (Throwable th2) {
                            r19 = th2;
                            throw th2;
                        }
                    } finally {
                    }
                } else {
                    LOG.warn("User has selected to preprocess images, but I can't find ImageMagick.Backing off to original file.");
                    doOCR(path.toFile(), file, tesseractOCRConfig, parseContext);
                }
                file2 = new File(file.getAbsolutePath() + "." + (tesseractOCRConfig.getPageSegMode().equals("0") ? "osd" : tesseractOCRConfig.getOutputType().toString().toLowerCase(Locale.US)));
                if (file2.exists()) {
                    try {
                        FileInputStream fileInputStream = new FileInputStream(file2);
                        Throwable th3 = null;
                        if (tesseractOCRConfig.getPageSegMode().equals("0")) {
                            extractOSD(fileInputStream, metadata);
                        } else if (tesseractOCRConfig.getOutputType().equals(TesseractOCRConfig.OUTPUT_TYPE.HOCR)) {
                            extractHOCROutput(fileInputStream, parseContext, contentHandler);
                        } else {
                            extractOutput(fileInputStream, contentHandler);
                        }
                        if (fileInputStream != null) {
                            if (0 != 0) {
                                try {
                                    fileInputStream.close();
                                } catch (Throwable th4) {
                                    th3.addSuppressed(th4);
                                }
                            } else {
                                fileInputStream.close();
                            }
                        }
                    } finally {
                    }
                }
            }
        } finally {
            if (file2 != null) {
                file2.delete();
            }
        }
    }

    /**
     * Reads tesseract's orientation-and-script-detection output and stores each recognised
     * entry in {@code r9}.
     *
     * <p>NOTE(review): the decompiler could not recover this method and left a stub that always
     * threw {@link UnsupportedOperationException}. This body is rebuilt from the recovered
     * fragments (six {@code PSM0_*} assignments with their numeric conversions, plus the
     * fallback warning). The line format {@code key: value} and the six key strings are taken
     * from tesseract's {@code .osd} output and are NOT visible in the decompiled source --
     * verify them against the original bytecode.
     *
     * @param r8 stream over the {@code .osd} file, decoded as UTF-8; closed by this method
     * @param r9 metadata object that receives the {@code PSM0_*} properties
     * @throws java.io.IOException if the stream cannot be read
     */
    private void extractOSD(java.io.InputStream r8, org.apache.tika.metadata.Metadata r9) throws java.io.IOException {
        try (java.io.BufferedReader reader = new java.io.BufferedReader(new InputStreamReader(r8, StandardCharsets.UTF_8))) {
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                int colon = line.indexOf(':');
                if (colon <= 0) {
                    // Not a "key: value" line.
                    continue;
                }
                String key = line.substring(0, colon);
                String value = line.substring(colon + 1).trim();
                switch (key) {
                    case "Page number":
                        r9.set(PSM0_PAGE_NUMBER, Integer.parseInt(value));
                        break;
                    case "Orientation in degrees":
                        r9.set(PSM0_ORIENTATION, Integer.parseInt(value));
                        break;
                    case "Rotate":
                        r9.set(PSM0_ROTATE, Integer.parseInt(value));
                        break;
                    case "Orientation confidence":
                        r9.set(PSM0_ORIENTATION_CONFIDENCE, Double.parseDouble(value));
                        break;
                    case "Script":
                        r9.set(PSM0_SCRIPT, value);
                        break;
                    case "Script confidence":
                        r9.set(PSM0_SCRIPT_CONFIDENCE, Double.parseDouble(value));
                        break;
                    default:
                        LOG.warn("I regret I don't know how to parse {} with value {}", key, value);
                        break;
                }
            }
        }
    }

    /** Logs the OCR notice through {@code warn()} unless {@code hasWarned()} reports it was already logged. */
    private void warnOnFirstParse() {
        if (!hasWarned()) {
            warn();
        }
    }

    /**
     * Builds the tesseract command line, starts the process and waits for it to finish.
     *
     * <p>The command is always {@code <tesseract> <input> <outputBase> --psm <mode>}. Unless the
     * page segmentation mode is {@code "0"}, the language, any extra {@code -c key=value}
     * settings, the page separator, the inter-word spacing flag and the output type are added.
     * The process is always destroyed and, if registered, released before returning.
     *
     * @param file image file given to tesseract as input
     * @param file2 output base path; tesseract appends the extension itself
     * @param tesseractOCRConfig effective configuration for this call
     * @param parseContext consulted for a task timeout that overrides the configured one
     * @throws IOException if the process cannot be started or its streams fail
     * @throws TikaException on timeout, interruption or a positive exit value
     */
    private void doOCR(File file, File file2, TesseractOCRConfig tesseractOCRConfig, ParseContext parseContext) throws IOException, TikaException {
        List<String> command = new ArrayList<>();
        command.add(getTesseractPath() + getTesseractProg());
        command.add(file.getPath());
        command.add(file2.getPath());
        command.add("--psm");
        command.add(tesseractOCRConfig.getPageSegMode());
        if (!"0".equals(tesseractOCRConfig.getPageSegMode())) {
            if (!StringUtils.isBlank(tesseractOCRConfig.getLanguage())) {
                command.add("-l");
                command.add(tesseractOCRConfig.getLanguage());
            }
            for (Map.Entry<String, String> setting : tesseractOCRConfig.getOtherTesseractConfig().entrySet()) {
                command.add("-c");
                command.add(setting.getKey() + "=" + setting.getValue());
            }
            command.add("-c");
            command.add("page_separator=" + tesseractOCRConfig.getPageSeparator());
            command.add("-c");
            command.add(tesseractOCRConfig.isPreserveInterwordSpacing() ? "preserve_interword_spaces=1" : "preserve_interword_spaces=0");
            command.add(tesseractOCRConfig.getOutputType().name().toLowerCase(Locale.US));
        }
        LOG.debug("Tesseract command: {}", String.join(" ", command));
        ProcessBuilder processBuilder = new ProcessBuilder(command);
        setEnv(processBuilder);
        // Multiply as long: seconds * 1000 evaluated as int overflows for very large timeouts.
        long timeoutMillis = TikaTaskTimeout.getTimeoutMillis(parseContext, tesseractOCRConfig.getTimeoutSeconds() * 1000L);
        Process process = null;
        String processId = null;
        try {
            process = processBuilder.start();
            processId = register(process);
            runOCRProcess(process, timeoutMillis);
        } finally {
            if (process != null) {
                process.destroyForcibly();
            }
            if (processId != null) {
                release(processId);
            }
        }
    }

    /**
     * Waits for the tesseract process while draining its stdout and stderr on two reader threads.
     *
     * @param process the started tesseract process; its stdin is closed immediately
     * @param j maximum time to wait, in milliseconds
     * @throws IOException if closing the process's stdin fails
     * @throws TikaException if the process does not finish in time, exits with a positive
     *         value (the message carries the stderr text captured so far), or the waiting
     *         thread is interrupted
     */
    private void runOCRProcess(Process process, long j) throws IOException, TikaException {
        process.getOutputStream().close();
        StringBuilder stdout = new StringBuilder();
        StringBuilder stderr = new StringBuilder();
        Thread stdoutReader = logStream(process.getInputStream(), stdout);
        Thread stderrReader = logStream(process.getErrorStream(), stderr);
        stdoutReader.start();
        stderrReader.start();
        try {
            if (!process.waitFor(j, TimeUnit.MILLISECONDS)) {
                throw new TikaException("TesseractOCRParser timeout");
            }
            int exitValue = process.exitValue();
            if (exitValue > 0) {
                try {
                    // Give the stderr reader up to a second to finish so the message is complete.
                    stderrReader.join(1000L);
                } catch (InterruptedException interrupted) {
                    // Still report the bad exit value, but keep the interrupt visible to callers.
                    Thread.currentThread().interrupt();
                }
                throw new TikaException("TesseractOCRParser bad exit value " + exitValue + " err msg: " + stderr.toString());
            }
        } catch (IllegalThreadStateException stillRunning) {
            throw new TikaException("TesseractOCRParser timeout", stillRunning);
        } catch (InterruptedException interrupted) {
            Thread.currentThread().interrupt();
            throw new TikaException("TesseractOCRParser interrupted", interrupted);
        }
    }

    /**
     * Copies plain-text OCR output into the content handler, wrapped in
     * {@code <div class="ocr">}.
     *
     * <p>The closing {@code div} is emitted only after the whole stream was copied; if reading
     * or the handler fails, the exception propagates after the reader has been closed.
     *
     * @param inputStream tesseract output, decoded as UTF-8; closed by this method
     * @param contentHandler receives the {@code div} element and the character data
     * @throws SAXException if the content handler fails
     * @throws IOException if the stream cannot be read
     */
    private void extractOutput(InputStream inputStream, ContentHandler contentHandler) throws SAXException, IOException {
        AttributesImpl divAttributes = new AttributesImpl();
        divAttributes.addAttribute("", "class", "class", "CDATA", "ocr");
        contentHandler.startElement(XHTMLContentHandler.XHTML, "div", "div", divAttributes);
        try (InputStreamReader reader = new InputStreamReader(inputStream, StandardCharsets.UTF_8)) {
            char[] buffer = new char[1024];
            for (int read = reader.read(buffer); read != -1; read = reader.read(buffer)) {
                if (read > 0) {
                    contentHandler.characters(buffer, 0, read);
                }
            }
        }
        contentHandler.endElement(XHTMLContentHandler.XHTML, "div", "div");
    }

    /**
     * Parses hOCR output as XML and forwards it to the content handler inside
     * {@code <div class="ocr">}, using {@link HOCRPassThroughHandler} to filter the events.
     *
     * @param inputStream hOCR document produced by tesseract
     * @param parseContext context for the SAX parse; a fresh one is used when this is null
     * @param contentHandler receives the {@code div} element and the forwarded hOCR events
     * @throws TikaException if the SAX parse cannot be set up or fails
     * @throws IOException if the stream cannot be read
     * @throws SAXException if the content handler fails
     */
    private void extractHOCROutput(InputStream inputStream, ParseContext parseContext, ContentHandler contentHandler) throws TikaException, IOException, SAXException {
        ParseContext effectiveContext = parseContext != null ? parseContext : new ParseContext();
        AttributesImpl divAttributes = new AttributesImpl();
        divAttributes.addAttribute("", "class", "class", "CDATA", "ocr");
        contentHandler.startElement(XHTMLContentHandler.XHTML, "div", "div", divAttributes);
        XMLReaderUtils.parseSAX(inputStream, new HOCRPassThroughHandler(contentHandler), effectiveContext);
        contentHandler.endElement(XHTMLContentHandler.XHTML, "div", "div");
    }

    /**
     * Creates (but does not start) a thread that reads {@code inputStream} as UTF-8 into
     * {@code sb} until end of stream, closes the stream quietly, and logs the collected text
     * at debug level. An {@link IOException} while reading ends the copy silently.
     *
     * @param inputStream stream to drain
     * @param sb buffer that accumulates everything read
     * @return the unstarted reader thread
     */
    private Thread logStream(InputStream inputStream, StringBuilder sb) {
        return new Thread(() -> {
            InputStreamReader reader = new InputStreamReader(inputStream, StandardCharsets.UTF_8);
            char[] buffer = new char[1024];
            try {
                int read = reader.read(buffer);
                while (read != -1) {
                    sb.append(buffer, 0, read);
                    read = reader.read(buffer);
                }
            } catch (IOException ignored) {
                // Deliberately ignored: whatever was read so far is still logged below.
            } finally {
                IOUtils.closeQuietly(inputStream);
            }
            LOG.debug("{}", sb);
        });
    }

    /**
     * Detects tesseract and ImageMagick, optionally preloads and validates languages, and
     * creates the image preprocessor.
     *
     * @param map initialization parameters; not read by this method
     * @throws TikaConfigException if a configured path is invalid or the default language
     *         fails validation
     */
    public void initialize(Map<String, Param> map) throws TikaConfigException {
        this.hasTesseract = hasTesseract();
        this.hasImageMagick = hasImageMagick();
        if (this.preloadLangs) {
            preloadLangs();
            String defaultLanguage = this.defaultConfig.getLanguage();
            if (!StringUtils.isBlank(defaultLanguage)) {
                validateLangString(defaultLanguage);
            }
        }
        String imageMagickCommand = getImageMagickPath() + getImageMagickProg();
        this.imagePreprocessor = new ImagePreprocessor(imageMagickCommand);
    }

    /**
     * Validates a tesseract language string.
     *
     * @param str language string, split into valid and invalid codes by
     *            {@code TesseractOCRConfig.getLangs}
     * @throws TikaConfigException if any code is invalid, or if the set of known languages is
     *         non-empty and does not contain one of the requested codes
     */
    private void validateLangString(String str) throws TikaConfigException {
        Set<String> invalidCodes = new HashSet<>();
        Set<String> validCodes = new HashSet<>();
        TesseractOCRConfig.getLangs(str, validCodes, invalidCodes);
        if (!invalidCodes.isEmpty()) {
            throw new TikaConfigException("Invalid language code(s): " + invalidCodes);
        }
        if (this.langs.isEmpty()) {
            // No known-language list to compare against.
            return;
        }
        for (String code : validCodes) {
            if (!this.langs.contains(code)) {
                throw new TikaConfigException("tesseract does not have " + code + " available. I see only: " + this.langs);
            }
        }
    }

    /**
     * Verifies that the default language, when set, is among the known languages.
     * The check is skipped while the known-language set is empty.
     *
     * @param initializableProblemHandler not used by this method
     * @throws TikaConfigException if the default language string is not in the known set
     */
    public void checkInitialization(InitializableProblemHandler initializableProblemHandler) throws TikaConfigException {
        if (this.langs.size() <= 0) {
            return;
        }
        if (StringUtils.isBlank(this.defaultConfig.getLanguage())) {
            return;
        }
        if (this.langs.contains(this.defaultConfig.getLanguage())) {
            return;
        }
        throw new TikaConfigException("It doesn't look like tesseract has lang data for " + this.defaultConfig.getLanguage() + ". I see only: " + this.langs);
    }

    /**
     * @return the parser's set of known language codes; this is the internal set itself,
     *         not a copy
     */
    public Set<String> getLangs() {
        return langs;
    }

    /**
     * @return whether the OCR notice has already been logged; when the first unsynchronized
     *         read says no, the flag is read again while holding {@code LOCK}
     */
    protected boolean hasWarned() {
        if (!HAS_WARNED) {
            synchronized (LOCK) {
                return HAS_WARNED;
            }
        }
        return true;
    }

    /** Logs the informational notice that tesseract is being invoked, then marks it as logged. */
    protected void warn() {
        final String notice = "Tesseract is installed and is being invoked. This can add greatly to processing time.  If you do not want tesseract to be applied to your files see: https://cwiki.apache.org/confluence/display/TIKA/TikaOCR#TikaOCR-disable-ocr";
        LOG.info(notice);
        HAS_WARNED = true;
    }

    /** @return the configured tesseract directory prefix, or an empty string if none was set */
    public String getTesseractPath() {
        return tesseractPath;
    }

    /**
     * Sets the directory that contains the tesseract executable. The value is normalized and,
     * unless empty, stored with a trailing file separator.
     *
     * @param str directory path; a value for which {@code FilenameUtils.normalize} returns
     *            {@code null} is dereferenced unguarded and raises a NullPointerException
     */
    @Field
    public void setTesseractPath(String str) {
        String normalized = FilenameUtils.normalize(str);
        boolean needsSeparator = !normalized.isEmpty() && !normalized.endsWith(File.separator);
        this.tesseractPath = needsSeparator ? normalized + File.separator : normalized;
    }

    /**
     * Returns the configured tessdata path.
     *
     * @return the current value of the {@code tessdataPath} field
     */
    public String getTessdataPath() {
        return tessdataPath;
    }

    /**
     * Sets the tessdata path. The value is normalized with
     * {@link FilenameUtils#normalize(String)} and, unless it is empty, is
     * stored with a trailing {@link File#separator}.
     *
     * @param str the path to store
     * @throws IllegalArgumentException if {@code str} is null or cannot be normalized
     */
    @Field
    public void setTessdataPath(String str) {
        String normalized = FilenameUtils.normalize(str);
        if (normalized == null) {
            // normalize() signals a null or invalid path by returning null; fail with
            // a message naming the bad value rather than a bare NullPointerException.
            throw new IllegalArgumentException("Invalid tessdata path: " + str);
        }
        if (!normalized.isEmpty() && !normalized.endsWith(File.separator)) {
            normalized = normalized + File.separator;
        }
        this.tessdataPath = normalized;
    }

    /**
     * Returns the configured ImageMagick path.
     *
     * @return the current value of the {@code imageMagickPath} field
     */
    public String getImageMagickPath() {
        return imageMagickPath;
    }

    /**
     * Sets the ImageMagick path. The value is normalized with
     * {@link FilenameUtils#normalize(String)} and, unless it is empty, is
     * stored with a trailing {@link File#separator}.
     *
     * @param str the path to store
     * @throws IllegalArgumentException if {@code str} is null or cannot be normalized
     */
    @Field
    public void setImageMagickPath(String str) {
        String normalized = FilenameUtils.normalize(str);
        if (normalized == null) {
            // normalize() signals a null or invalid path by returning null; fail with
            // a message naming the bad value rather than a bare NullPointerException.
            throw new IllegalArgumentException("Invalid ImageMagick path: " + str);
        }
        if (!normalized.isEmpty() && !normalized.endsWith(File.separator)) {
            normalized = normalized + File.separator;
        }
        this.imageMagickPath = normalized;
    }

    /**
     * Registers additional tesseract settings on the default config.
     *
     * <p>Each entry is trimmed and split on runs of whitespace; it must yield
     * exactly two tokens, which are passed as key and value to
     * {@code addOtherTesseractConfig}. Entries preceding a malformed one have
     * already been applied when the exception is thrown.
     *
     * @param list the settings, one "key value" pair per element
     * @throws TikaConfigException if an entry does not split into exactly two tokens
     */
    @Field
    public void setOtherTesseractSettings(List<String> list) throws TikaConfigException {
        for (String setting : list) {
            String[] keyValue = setting.trim().split("\\s+");
            if (keyValue.length != 2) {
                throw new TikaConfigException("Expected space delimited key value pair. However, I found " + keyValue.length + " bits.");
            }
            defaultConfig.addOtherTesseractConfig(keyValue[0], keyValue[1]);
        }
    }

    /**
     * Forwards the skip-OCR flag to the default config.
     *
     * @param z value passed to {@code setSkipOcr}
     */
    @Field
    public void setSkipOCR(boolean z) {
        defaultConfig.setSkipOcr(z);
    }

    /**
     * Forwards the language setting to the default config.
     *
     * @param str value passed to {@code setLanguage}
     */
    @Field
    public void setLanguage(String str) {
        defaultConfig.setLanguage(str);
    }

    /**
     * Forwards the page segmentation mode to the default config.
     *
     * @param str value passed to {@code setPageSegMode}
     */
    @Field
    public void setPageSegMode(String str) {
        defaultConfig.setPageSegMode(str);
    }

    /**
     * Forwards the maximum file size to OCR to the default config.
     *
     * @param j value passed to {@code setMaxFileSizeToOcr}
     */
    @Field
    public void setMaxFileSizeToOcr(long j) {
        defaultConfig.setMaxFileSizeToOcr(j);
    }

    /**
     * Forwards the minimum file size to OCR to the default config.
     *
     * @param j value passed to {@code setMinFileSizeToOcr}
     */
    @Field
    public void setMinFileSizeToOcr(long j) {
        defaultConfig.setMinFileSizeToOcr(j);
    }

    /**
     * Forwards the timeout to the default config via {@code setTimeoutSeconds}.
     *
     * @param i value passed to {@code setTimeoutSeconds}
     */
    @Field
    public void setTimeout(int i) {
        defaultConfig.setTimeoutSeconds(i);
    }

    /**
     * Forwards the output type to the default config.
     *
     * @param str value passed to {@code setOutputType}
     */
    @Field
    public void setOutputType(String str) {
        defaultConfig.setOutputType(str);
    }

    /**
     * Forwards the preserve-interword-spacing flag to the default config.
     *
     * @param z value passed to {@code setPreserveInterwordSpacing}
     */
    @Field
    public void setPreserveInterwordSpacing(boolean z) {
        defaultConfig.setPreserveInterwordSpacing(z);
    }

    /**
     * Forwards the image-preprocessing flag to the default config.
     *
     * @param z value passed to {@code setEnableImagePreprocessing}
     */
    @Field
    public void setEnableImagePreprocessing(boolean z) {
        defaultConfig.setEnableImagePreprocessing(z);
    }

    /**
     * Forwards the density value to the default config.
     *
     * @param i value passed to {@code setDensity}
     */
    @Field
    public void setDensity(int i) {
        defaultConfig.setDensity(i);
    }

    /**
     * Forwards the depth value to the default config.
     *
     * @param i value passed to {@code setDepth}
     */
    @Field
    public void setDepth(int i) {
        defaultConfig.setDepth(i);
    }

    /**
     * Forwards the colorspace setting to the default config.
     *
     * @param str value passed to {@code setColorspace}
     */
    @Field
    public void setColorspace(String str) {
        defaultConfig.setColorspace(str);
    }

    /**
     * Forwards the filter setting to the default config.
     *
     * @param str value passed to {@code setFilter}
     */
    @Field
    public void setFilter(String str) {
        defaultConfig.setFilter(str);
    }

    /**
     * Forwards the resize value to the default config.
     *
     * @param i value passed to {@code setResize}
     */
    @Field
    public void setResize(int i) {
        defaultConfig.setResize(i);
    }

    /**
     * Forwards the apply-rotation flag to the default config.
     *
     * @param z value passed to {@code setApplyRotation}
     */
    @Field
    public void setApplyRotation(boolean z) {
        defaultConfig.setApplyRotation(z);
    }

    /**
     * Stores the preload-languages flag on this parser.
     *
     * @param z new value of the {@code preloadLangs} field
     */
    @Field
    public void setPreloadLangs(boolean z) {
        preloadLangs = z;
    }

    /**
     * Returns the default OCR configuration held by this parser.
     *
     * @return the {@code defaultConfig} instance itself (not a copy)
     */
    public TesseractOCRConfig getDefaultConfig() {
        return defaultConfig;
    }

    /**
     * Launches the tesseract executable with {@code --list-langs} and passes
     * the process to {@code getLangs(Process, int)} with the default config's
     * timeout.
     *
     * <p>An {@link IOException} or {@code TikaException} is logged at warn
     * level and not propagated. The process, if it was started, is always
     * forcibly destroyed before this method returns or throws.
     */
    private void preloadLangs() {
        String executable = getTesseractPath() + getTesseractProg();
        ProcessBuilder builder = new ProcessBuilder(executable, "--list-langs");
        setEnv(builder);
        Process listing = null;
        try {
            listing = builder.start();
            getLangs(listing, defaultConfig.getTimeoutSeconds());
        } catch (TikaException | IOException e) {
            LOG.warn("Problem preloading langs", e);
        } finally {
            if (listing != null) {
                listing.destroyForcibly();
            }
        }
    }

    /**
     * Waits for the given process to finish, then adds each non-blank line of
     * its standard output, trimmed, to {@code langs}. Lines starting with
     * "List of available" are skipped.
     *
     * @param process the started process whose output lists the languages
     * @param i maximum time to wait for the process, in seconds
     * @throws IOException if closing the process's output stream fails
     * @throws TikaException if the process does not finish within the timeout,
     *         exits with a value greater than zero, or the wait is interrupted
     */
    private void getLangs(Process process, int i) throws IOException, TikaException {
        // Nothing is sent to the child process, so close its stdin right away.
        process.getOutputStream().close();
        InputStream inputStream = process.getInputStream();
        InputStream errorStream = process.getErrorStream();
        StringBuilder stdout = new StringBuilder();
        StringBuilder stderr = new StringBuilder();
        // NOTE(review): logStream presumably copies the stream into the builder
        // on the returned thread - confirm against its definition.
        Thread outThread = logStream(inputStream, stdout);
        Thread errThread = logStream(errorStream, stderr);
        outThread.start();
        errThread.start();
        try {
            if (!process.waitFor(i, TimeUnit.SECONDS)) {
                throw new TikaException("TesseractOCRParser timeout");
            }
            int exitValue = process.exitValue();
            // Give both reader threads a chance to finish before their buffers are
            // read; stderr is needed for the error message below.
            outThread.join(1000L);
            errThread.join(1000L);
            if (exitValue > 0) {
                throw new TikaException("TesseractOCRParser bad exit value " + exitValue + " err msg: " + stderr.toString());
            }
            for (String line : stdout.toString().split("[\r\n]+")) {
                String lang = line.trim();
                // Splitting empty output yields one empty token; recording it would make
                // langs non-empty and cause every configured language to be rejected.
                if (lang.isEmpty() || lang.startsWith("List of available")) {
                    continue;
                }
                this.langs.add(lang);
            }
        } catch (IllegalThreadStateException e) {
            throw new TikaException("TesseractOCRParser timeout");
        } catch (InterruptedException e2) {
            Thread.currentThread().interrupt();
            throw new TikaException("TesseractOCRParser interrupted", e2);
        }
    }
}
