"""Force OCR pipelet."""

import subprocess  # noqa: S404
from pathlib import Path
from typing import TYPE_CHECKING

from octopus.text import sanitize_text
from squirro.common.config import get_config
from squirro.lib.storage.handler import StorageHandler
from squirro.sdk import PipeletV1, require

if TYPE_CHECKING:
    from logging import Logger
    from typing import Any


@require("log")
class ForceOCRPipelet(PipeletV1):  # type: ignore[misc]
    """Force OCR pipelet.

    This pipelet extracts the text of the first PDF file attached to an
    item and counts its words. If the PDF has fewer than ``min_words``
    words (including none at all), the item is flagged so that it gets
    OCR-ed. Items without a PDF file are passed through untouched.
    """

    _min_words: int
    log: "Logger"

    def __init__(self, config: "dict[str, Any]") -> None:
        """Initialize the pipelet.

        Args:
            config: The pipelet configuration. The optional ``min_words``
                key sets the word threshold below which OCR is forced
                (defaults to 50).
        """
        self._min_words = config.get("min_words", 50)

    def consume(self, item: "dict[str, Any]") -> "dict[str, Any]":
        """Consume an item.

        Sets ``item["pdf_ocr:force_ocr"]`` to a boolean and
        ``item["keywords"]["force_ocr"]`` to ``["true"]`` or ``["false"]``
        when the item has a PDF file.

        Args:
            item: The item to consume

        Returns:
            The consumed item
        """
        self.log.info("Checking whether to force OCR")

        pdf_content: str | None = self._get_pdf_content(item)
        # Only ``None`` means "no PDF attached". An empty string means the
        # PDF yielded no text at all, which is exactly the case that needs
        # OCR, so it must fall through to the evaluation below.
        if pdf_content is None:
            self.log.info("No PDF content found")
            return item

        if force_ocr := self._evaluate_force_ocr(pdf_content):
            # ``.get`` so that a missing id cannot crash the pipelet from
            # within a log statement.
            self.log.info("Forcing OCR on item %s", item.get("id"))

        item["pdf_ocr:force_ocr"] = force_ocr
        # Create the keywords dict on demand instead of raising KeyError
        # for items that do not carry any keywords yet.
        item.setdefault("keywords", {})["force_ocr"] = [
            "true" if force_ocr else "false"
        ]
        return item

    def _evaluate_force_ocr(self, txt: str) -> bool:
        """Evaluate if force ocr is needed.

        Args:
            txt: The text to evaluate

        Returns:
            Whether force ocr is needed, i.e. whether the sanitized text
            has fewer whitespace-separated words than ``min_words``
        """
        txt = sanitize_text(txt)
        return len(txt.split()) < self._min_words

    def _get_pdf_content(self, item: "dict[str, Any]") -> "str | None":
        """Extract the text of the item's first PDF file.

        The first entry of ``item["files"]`` whose ``mime_type`` is
        ``application/pdf`` is resolved to a path on local storage and
        passed to the ``pdftotext`` command line tool, limited to pages
        1 to 5.

        Args:
            item: The item whose files are inspected

        Returns:
            The extracted text (possibly empty), or ``None`` if the item
            has no PDF file

        Raises:
            Exception: Any error raised while resolving the content URL or
                while running ``pdftotext`` is logged and re-raised.
        """
        for file in item.get("files", []):
            if file.get("mime_type", "") == "application/pdf":
                content_url = file.get("content_url", "")
                break
        else:
            # Loop finished without finding a PDF file.
            return None

        config = get_config("squirro.lib.storage")

        try:
            bucket, _, path = StorageHandler(config)._parse_url(content_url)  # noqa: SLF001
            full_path = str(Path(config.get(f"storage_{bucket}", "directory")) / path)
        except Exception:
            self.log.exception("Failed to parse content URL.")
            raise

        # "-f 1 -l 5" restricts extraction to the first five pages; the
        # trailing "-" makes pdftotext write the text to stdout.
        args = ["pdftotext", "-f", "1", "-l", "5", full_path, "-"]
        try:
            self.log.info("Reading PDF from %s", full_path)

            ret = subprocess.run(  # noqa: S603
                args, capture_output=True, check=True
            )
            # The text is only used for counting words, so undecodable
            # bytes are replaced rather than aborting the whole item.
            content = ret.stdout.decode("utf-8", errors="replace")
        except Exception:
            self.log.exception("Failed to extract text from PDF.")
            raise

        return content

    @staticmethod
    # pylint: disable-next=invalid-name
    def getArguments() -> "list[dict[str, Any]]":  # noqa: N802
        """Return a list of arguments that are configurable from the UI.

        Returns:
            List of arguments
        """
        return [
            {
                "name": "min_words",
                "display_label": "Minimum number of words",
                "type": "int",
                "default": 50,
            }
        ]
