package com.atlassian.confluence.plugins.extractor;

import com.atlassian.bonnie.search.SearchableAttachment;
import com.atlassian.bonnie.search.extractor.BaseAttachmentContentExtractor;
import com.atlassian.bonnie.search.extractor.ExtractorException;
import com.atlassian.bonnie.search.extractor.util.AbstractLengthLimitedStringBuilder;
import com.atlassian.bonnie.search.extractor.util.LimitReachedException;
import com.atlassian.bonnie.search.extractor.util.StaticLengthLimitedStringBuilder;
import com.atlassian.bonnie.search.extractor.util.StringBuilderWriter;
import com.atlassian.confluence.index.attachment.AttachmentTextExtractor;
import com.atlassian.confluence.pages.Attachment;
import com.atlassian.confluence.util.io.InputStreamSource;
import org.apache.commons.io.IOUtils;
import org.apache.lucene.document.Document;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;

/**
 * Extracts the text content of PDF attachments using PDFBox so that it can be indexed.
 * <p>
 * The amount of extracted text is capped by {@code maxResultSize} (in bytes, counting two bytes per
 * character); once the cap is hit, extraction stops and whatever was collected so far is returned.
 */
public class PdfContentExtractor extends BaseAttachmentContentExtractor implements AttachmentTextExtractor
{
    private static final Logger log = LoggerFactory.getLogger(PdfContentExtractor.class);

    private static final String[] EXTENSIONS = { "pdf" };
    private static final String[] CONTENT_TYPES = { "application/pdf" };
    private static final int DEFAULT_MAX_RESULT_SIZE = 8 * 1024 * 1024; // 8 MB
    private final int maxResultSize;

    /**
     * Creates an extractor that uses the default result size limit of 8 MB.
     */
    public PdfContentExtractor()
    {
        this(DEFAULT_MAX_RESULT_SIZE);
    }

    /**
     * Creates an extractor with a custom result size limit.
     *
     * @param maxResultSize maximum size of the extracted text in bytes; half of this value is used
     *                      as the character limit of the underlying string builder
     */
    public PdfContentExtractor(int maxResultSize)
    {
        super();
        this.maxResultSize = maxResultSize;
    }

    /**
     * @return the MIME types this extractor handles ({@code application/pdf})
     */
    protected String[] getMatchingContentTypes()
    {
        return CONTENT_TYPES;
    }

    /**
     * @return the file extensions this extractor handles ({@code pdf})
     */
    protected String[] getMatchingFileExtensions()
    {
        return EXTENSIONS;
    }

    /**
     * Loads the PDF from the given stream and returns its text, truncated at the configured limit.
     * <p>
     * Customized copy of the LucenePDFDocument.addContent() in PDFBox.
     *
     * @param is         stream positioned at the start of the PDF data
     * @param attachment the attachment being extracted (not used by this implementation)
     * @return the extracted text, possibly truncated when the size limit was reached
     * @throws ExtractorException if the document is password protected or cannot be read
     */
    protected String extractText(InputStream is, SearchableAttachment attachment) throws ExtractorException
    {
        PDDocument pdfDocument = null;
        try
        {
            pdfDocument = PDDocument.load(is);
            PDFTextStripper stripper = new PDFTextStripper();
            // 1 char == 2 bytes, so the character limit is half of the byte limit
            StringBuilderWriter writer = new StringBuilderWriter(
                    new StaticLengthLimitedStringBuilder(maxResultSize / 2, AbstractLengthLimitedStringBuilder.LIMIT_BEHAVIOUR.THROW));
            try
            {
                stripper.writeText(pdfDocument, writer);
            }
            catch (LimitReachedException e)
            {
                // We got enough data; keep what has been written so far
                log.debug("Reached maximum result length of {} bytes", maxResultSize);
            }
            finally
            {
                writer.close();
            }

            return writer.toString();
        }
        catch (InvalidPasswordException e)
        {
            // they didn't supply a password and the default of "" was wrong.
            throw new ExtractorException("Password required for encrypted PDF document", e);
        }
        catch (Exception e)
        {
            throw new ExtractorException("Error getting content of PDF document", e);
        }
        finally
        {
            closeQuietly(pdfDocument);
        }
    }

    /**
     * Closes the document without propagating any failure, so that a problem while releasing the
     * document never discards text that was already extracted. Failures are logged instead of
     * being silently dropped.
     *
     * @param pdfDocument the document to close; may be {@code null} when loading failed
     */
    private static void closeQuietly(PDDocument pdfDocument)
    {
        if (pdfDocument == null)
        {
            return;
        }
        try
        {
            pdfDocument.close();
        }
        catch (Exception e)
        {
            log.warn("Failed to close PDF document", e);
        }
    }

    /**
     * @return the file extensions this extractor handles, as a list
     */
    @Override
    public List<String> getFileExtensions()
    {
        return Arrays.asList(EXTENSIONS);
    }

    /**
     * @return the MIME types this extractor handles, as a list
     */
    @Override
    public List<String> getMimeTypes()
    {
        return Arrays.asList(CONTENT_TYPES);
    }

    /**
     * Runs {@code addFields} for the attachment against a throw-away Lucene document and exposes
     * the text collected in the buffer as a UTF-8 encoded stream source.
     *
     * @param attachment the attachment to extract text from
     * @return a source that opens a new stream over the collected text on each invocation
     */
    @Override
    public Optional<InputStreamSource> extract(Attachment attachment)
    {
        Document document = new Document();
        StringBuffer buffer = new StringBuffer();
        addFields(document, buffer, attachment);
        // Snapshot the text once; the buffer is not modified after this point
        final String text = buffer.toString();
        return Optional.of(() -> IOUtils.toInputStream(text, StandardCharsets.UTF_8));
    }
}
