"""Duplicate handling pipelet."""

from typing import TYPE_CHECKING

from octopus.clients import init_redis_client, init_squirro_client
from octopus.utils import compute_hash, load_config
from squirro.common.config import get_config
from squirro.lib.storage.handler import StorageHandler
from squirro.sdk import PipeletV1, require

if TYPE_CHECKING:
    from io import BufferedReader
    from logging import Logger
    from typing import Any


def _load_file(item: "dict[str, Any]") -> "BufferedReader":
    """Open the first file attached to an item from the Squirro storage.

    The storage handler is built from the ``squirro.lib.storage``
    configuration section, and the file is located through the
    ``content_url`` of the first entry in ``item["files"]``.

    Args:
        item: The item whose first attached file should be opened

    Returns:
        The handle returned by the storage handler; it is not closed
        here, so the caller is responsible for closing it
    """
    handler = StorageHandler(get_config("squirro.lib.storage"))
    content_url = item["files"][0]["content_url"]
    reader: BufferedReader = handler.open(content_url)
    return reader


@require("log")
class DuplicateHandlingPipelet(PipeletV1):  # type: ignore[misc]
    """Duplicate handling pipelet.

    This pipelet computes a binary hash for the given document and
    checks if there is already a non-deleted document with the same hash
    in the project. If there is, the document is rejected and a message
    pointing to the original document is stored in Redis.
    """

    item_url = ""
    log: "Logger"
    project_id: str
    url: str

    def __init__(self, _: "dict[str, Any]") -> None:
        """Initialize the pipelet.

        Sets up the Squirro client (together with the project id), the
        Redis client, and the base URL read from the ``squirro.url``
        entry of the loaded configuration.

        Args:
            _: The pipelet configuration; not used
        """
        self.sq_client, self.project_id = init_squirro_client()
        self.redis_client = init_redis_client()
        self.url = load_config()["squirro"]["url"]

    def consume(self, item: "dict[str, Any]") -> "dict[str, Any] | None":
        """Consume an item.

        The item's ``binary_hash`` keyword is updated with the computed
        hash unless the ``skip_binary_hash_compute`` keyword is present,
        in which case the first existing ``binary_hash`` value is used
        and the skip marker is removed from the item.

        Args:
            item: The item to consume

        Returns:
            The item if no duplicate was found, ``None`` if the item is a
            duplicate and has been rejected

        Raises:
            Exception: Whatever the storage raised while opening the
                file; it is logged and re-raised unchanged
        """
        self.log.info("Checking for duplicates.")

        # Read the document from the storage
        try:
            bytes_io = _load_file(item)
        except Exception:
            self.log.exception("Could not read file from storage.")
            raise

        # The handle is only needed for hashing, so release it on every
        # path (including errors) instead of leaking it.
        try:
            # Items without any keywords get an empty mapping so the hash
            # can still be attached below.
            keywords = item.setdefault("keywords", {})

            if "skip_binary_hash_compute" in keywords:
                binary_hash = keywords["binary_hash"][0]
                keywords.pop("skip_binary_hash_compute")
            else:
                binary_hash = compute_hash(bytes_io)
                # De-duplicate while keeping a stable order: the skip
                # path above reads index 0, so the order must not depend
                # on set iteration.
                known_hashes = keywords.get("binary_hash", [])
                keywords["binary_hash"] = list(
                    dict.fromkeys([*known_hashes, binary_hash])
                )
        finally:
            bytes_io.close()

        # Check for duplicates
        duplicated_id = self.check_for_duplicates(binary_hash)
        if not duplicated_id:
            self.log.info("No duplicates found.")
            return item

        # Log the duplicated item
        self._log_duplicated_item(duplicated_id, item)
        return None

    def check_for_duplicates(self, digest: str) -> "str | None":
        """Check whether a duplicate exists.

        A document is a duplicate if the project contains a document
        with the given hash which is not deleted.

        Args:
            digest: The hash

        Returns:
            The id of the first matching item, or ``None`` if the query
            returned no items
        """
        result: dict[str, Any] = self.sq_client.query(
            self.project_id,
            query=f"binary_hash:{digest} -is_deleted:true",
        )
        items = result.get("items", [])
        if not items:
            return None

        iid: str = items[0]["id"]
        return iid

    def _log_duplicated_item(self, duplicated_id: str, item: "dict[str, Any]") -> None:
        """Log the duplicated item and store the message in Redis.

        The message is written to the ``duplicated_item_wfi_hash`` Redis
        hash when the item's first ``source_type`` keyword starts with
        ``WFI``, and to ``duplicated_item_hash`` otherwise. The item id
        is used as the field name.

        Args:
            duplicated_id: The id of the already existing document
            item: The rejected item
        """
        msg = (
            f"The document you uploaded is a duplicate of {duplicated_id} and "
            "has been rejected. This is the link to the original document: "
            f"{self.url}/app/#dashboard/{self.project_id}?"
            f"modal-item-id={duplicated_id}"
        )
        self.log.warning(msg)

        # A missing or empty ``source_type`` is treated as "not WFI"
        # rather than raising.
        source_types = item.get("keywords", {}).get("source_type") or [""]
        if source_types[0].startswith("WFI"):
            hash_name = "duplicated_item_wfi_hash"
        else:
            hash_name = "duplicated_item_hash"
        self.redis_client.hset(hash_name, item["id"], msg)
