"""Store files into WFI or queue for retry."""

import json
import re
from pathlib import Path
from typing import TYPE_CHECKING

from requests.exceptions import HTTPError

from octopus.activity_tracking import RejectionReason
from octopus.clients import init_redis_client, init_squirro_client, init_wfi_client
from octopus.text import remove_special_characters
from squirro.common.config import get_config
from squirro.lib.storage.handler import StorageHandler
from squirro.sdk import PipeletV1, require
from squirro.service.pdfconversion.controllers import ConvertController

if TYPE_CHECKING:
    from logging import Logger
    from typing import Any

# Name of the Redis hash that holds items whose WFI check in failed and that
# are queued for retry.
REDIS_FAILED_ITEMS_HASH = "failed_items_hash"
# Placeholder used in success entries when a label is missing from the item.
UNKNOWN = "UNKNOWN"
# Items whose title is longer than this are rejected before check in.
MAX_TITLE_LEN = 255
# HTTP status code for which a failed WFI check in is rejected, not retried.
BAD_REQUEST = 400


# pylint: disable=too-few-public-methods
@require("log")
class StoreIntoWFI(PipeletV1):  # type: ignore[misc]
    """Store files into WFI or queue for retry.

    The pipelet gathers all information needed for the WFI request
    (including default values for missing fields) and filters out
    items that do not conform to WFI requirements. It then attempts to
    store the file in WFI; if the check in fails with an HTTP error other
    than a bad request, the item is queued for retry.
    """

    log: "Logger"

    def __init__(self, _: "dict[str, Any]") -> None:
        """Initialize the pipelet.

        Sets up the Squirro, WFI and Redis clients, the storage handler and
        the names of the Redis hashes used to record rejected and
        successfully stored items.

        Args:
            _: Unused; presumably the pipelet configuration passed in by the
                pipeline (TODO confirm).
        """
        # The Squirro client comes together with the id of its project.
        self.sq_client, self.project_id = init_squirro_client()
        self.wfi_client = init_wfi_client()
        self.redis_client = init_redis_client()
        # Storage handler built from the "squirro.lib.storage" configuration.
        self.storage_handler = StorageHandler(get_config("squirro.lib.storage"))
        # Default Redis hash names; `consume` switches to WFI-specific hashes
        # for items whose source type starts with "WFI".
        self.item_rejection_hash = "item_rejection_hash"
        self.item_success_hash = "item_success_hash"

    def consume(self, item: "dict[str, Any]") -> "dict[str, Any] | None":
        """Consume an item.

        Args:
            item: The item to consume

        Returns:
            The consumed item
        """
        keywords: dict[str, Any] = item.get("keywords", {})

        is_initial_checkin = True
        if keywords.get("source_type", [""])[0].startswith("WFI"):
            # Item from WFI treated as initial checkin if unzipped
            is_initial_checkin = item.get("wfi:initial_checkin", False)
            self.item_rejection_hash = "item_wfi_rejection_hash"
            self.item_success_hash = "item_wfi_success_hash"

        file_to_store = self._get_file_to_store(item)

        if not file_to_store or not self._is_valid_item(item, file_to_store):
            return None

        new_document_id = ""
        try:
            if is_initial_checkin:
                # Normal check in
                self._set_default_values(item)
                new_document_id = self._store_item_into_wfi(item, file_to_store)
            else:
                # Check in as minor version if file was ocr-ed
                is_file_ocred = file_to_store.get("pdf_ocr:ocr_status", "") == "success"
                if is_file_ocred:
                    new_document_id = self._store_item_into_wfi(
                        item, file_to_store, initial_checkin=False
                    )
        except HTTPError as error:
            if error.response is not None and error.response.status_code == BAD_REQUEST:
                return None  # Reject item
            return item  # Item queued for retry

        # Keep the original document id if no check in
        if not new_document_id:
            new_document_id = keywords["wfi_document_id"][0]

        self._post_process_item(item, file_to_store, new_document_id)
        return item

    def _store_item_into_wfi(
        self,
        item: "dict[str, Any]",
        file: "dict[str, str]",
        *,
        initial_checkin: bool = True,
    ) -> str:
        """Prepare the payload and perform a check in to WFI.

        Args:
            item: Current item.
            file: File to check in.
            initial_checkin: Whether to perform check in as first time or minor version.

        Returns:
            The new ID of the checked in document

        Raises:
            HTTPError: If the check in fails
        """
        item_id: str = item["id"]
        document_id: str = item.get("keywords", {}).get("wfi_document_id", [""])[0]
        files, data = self._prepare_checkin_payload(
            item, file, initial_checkin=initial_checkin
        )

        checkin_msg = f"Checking in item {item_id}"
        if document_id:
            checkin_msg += f" (minor version) into WFI document id {document_id}"
        self.log.info(
            "%s\nPayload files %s: %s\nPayload data %s: %s",
            checkin_msg,
            item_id,
            files,
            item_id,
            data,
        )

        try:
            new_document_id: str = self.wfi_client.checkin_document(
                files,
                data,
                document_id=document_id,
            )
        except HTTPError as error:
            response = error.response
            if (
                response is not None and response.status_code == BAD_REQUEST
            ):  # No point retrying for BAD_REQUEST
                self.log.exception("WFI BAD_REQUEST BAD REQUEST")
                wfi_id = item["keywords"].get("wfi_id_original", [""])[0]
                redis_key = f"{item_id}_{wfi_id}" if wfi_id else item_id
                self._reject_item(redis_key, RejectionReason.WFI_INVALID_REQUEST, file)
                raise

            self._reject_item(item_id, RejectionReason.WFI_CHECKIN_FAILED)
            self._add_to_retry_redis(item, file, document_id)
            self.log.exception(
                "Uploading item `%s` to WFI failed, item is queued for retry",
                item_id,
            )
            raise

        self.log.info(
            "Item %s successfully stored into WFI %s", item_id, new_document_id
        )
        return new_document_id

    def _post_process_item(
        self,
        item: "dict[str, Any]",
        file_stored: "dict[str, str]",
        document_id: str,
    ) -> None:
        """Post process item after storing into WFI.

        Update item's labels, delete local file, modify content url after
        storing into WFI.

        Args:
            item: Current item.
            file_stored: The file that was stored into WFI.
            document_id: WFI document ID of the checked-in document.
        """
        item["keywords"]["wfi_document_id"] = [document_id]
        item["keywords"]["wfi_status"] = ["Success"]

        self.storage_handler.delete(file_stored["content_url"])
        self._modify_content_url(item, file_stored, document_id)
        self._add_success_entry(item)

    def _prepare_checkin_payload(
        self,
        item: "dict[str, Any]",
        file: "dict[str, str]",
        *,
        initial_checkin: bool,
    ) -> "tuple[dict[str, Any], dict[str, str]]":
        """Prepare the request payload to check in document into WFI.

        Besides building the payload this also updates the item: the title
        is replaced by its sanitized form and, when the item has references,
        they are joined into the ``wfi_references`` label.

        Args:
            item: Current item.
            file: File to check in.
            initial_checkin: Whether to perform check in as first time or minor version.

        Returns:
            A tuple containing the files and data to be used in request body.
            The ``File`` entry is a handle opened through the storage
            handler; the caller is responsible for it.
        """
        allowed_chars = "_.-:,&?!'\u2019)( "
        title = remove_special_characters(item["title"], list(allowed_chars))

        item["title"] = title  # To be consistent with WFI

        properties = [
            {
                "propertyName": "DocumentTitle",
                "propertyDataType": "string",
                "value": title,
            }
        ]

        keywords = item["keywords"]
        if references := keywords.get("references", []):
            keywords["wfi_references"] = [";".join(references)]

        if initial_checkin:
            for label, metadata in self.wfi_client.FIELDS.items():
                # A label that is present but empty falls back to the default
                # as well, instead of failing on `[0]`.
                values = keywords.get(label) or metadata["default"]
                # Build a fresh property dict: updating `metadata["wfi"]` in
                # place would write this item's value into the shared field
                # definitions.
                properties.append({**metadata["wfi"], "value": values[0]})

        doc_props = {
            "className": "BBCADocuments",
            "propertyList": properties,
        }

        return (
            {"File": self.storage_handler.open(file["content_url"])},
            {"DocProps": json.dumps(doc_props)},
        )

    def _modify_content_url(
        self,
        item: "dict[str, Any]",
        file_stored: "dict[str, str]",
        document_id: "str",
    ) -> None:
        """Modify the content url that points to the storage plugin's url.

        This ensures that documents are fetched directly from WFI. If document
        needs to be converted to PDF on the fly, also modify the pdf_conversion
        url.

        Args:
            item: Current item.
            file_stored: The file that was stored into WFI.
            document_id: WFI document ID of the checked-in document.
        """
        files = item.get("files", [])

        content_url = f"storage://wfi/{self.project_id}/{document_id}"

        for file in files:
            if file.get("content_url", "").startswith("storage://pdf_conversion"):
                # pylint: disable=protected-access
                file["content_url"] = (
                    "storage://pdf_conversion/"
                    + ConvertController._pdf_path(content_url)  # noqa: SLF001
                )

            if file["id"] == file_stored["id"]:
                file["id"] = document_id
                file["content_url"] = content_url

    def _is_valid_item(self, item: "dict[str, Any]", file: "dict[str, str]") -> bool:
        """Check whether the item may be stored into WFI.

        An item is rejected (recorded via ``_reject_item`` together with its
        file) when its title is too long, when the file does not exist in
        storage, or when the first value of a WFI field label is not a
        string or exceeds the field's length limit.

        Args:
            item: Current item.
            file: File to check in.

        Returns:
            Whether the item is valid or not
        """
        item_id: str = item["id"]
        title: str = item["title"]
        content_url: str = file["content_url"]

        def reject(reason: str, message: str, *details: object) -> bool:
            """Log the warning, record the rejection and report failure."""
            self.log.warning(message, item_id, *details)
            self._reject_item(item_id, reason, file)
            return False

        if len(title) > MAX_TITLE_LEN:
            return reject(
                RejectionReason.WFI_LABEL_TOO_LONG,
                "Rejecting item `%s`: title `%s` is too long",
                title,
            )

        if not self.storage_handler.exists(content_url):
            return reject(
                RejectionReason.FILE_NOT_FOUND,
                "Rejecting item `%s`: file at `%s` does not exist",
                content_url,
            )

        keywords = item.get("keywords", {})
        for label, metadata in self.wfi_client.FIELDS.items():
            values = keywords.get(label, [])
            # Only non-empty lists are validated; anything else is skipped.
            if not isinstance(values, list) or not values:
                continue

            first_value = values[0]
            if not isinstance(first_value, str):
                return reject(
                    RejectionReason.WFI_LABEL_NOT_SUPPORTED,
                    "Rejecting item `%s`: value `%s` for keyword `%s` is not string",
                    first_value,
                    label,
                )

            if len(first_value) > metadata["length"]:
                return reject(
                    RejectionReason.WFI_LABEL_TOO_LONG,
                    "Rejecting item `%s`: value `%s` for keyword `%s` is too long",
                    first_value,
                    label,
                )

        return True

    def _set_default_values(self, item: "dict[str, Any]") -> None:
        """Set default values of the item's labels that relate to WFI.

        Args:
            item: Current item.
        """
        keywords: dict[str, Any] = item.get("keywords", {})

        for label, metadata in self.wfi_client.FIELDS.items():
            if label not in keywords:
                item["keywords"][label] = metadata["default"]

    def _get_file_to_store(self, item: "dict[str, Any]") -> "dict[str, str] | None":
        """Get the file to store into WFI.

        If OCR, get the OCR-ed version. If not, get the original file.

        Args:
            item: Current item.

        Returns:
            The file to store into WFI.
        """
        files: list[dict[str, str]] = item.get("files", [])
        localfiles = [
            file
            for file in files
            if (content_url := file.get("content_url", ""))
            and not re.search(r".preview.[^.]+$", content_url)
            and not content_url.startswith("storage://pdf_conversion/")
        ]

        if not localfiles:
            self.log.warning("Item %s has no localfile.", item["id"])
            self._reject_item(item["id"], RejectionReason.FILE_NOT_FOUND)
            return None

        # If OCR, item will have the OCR-ed version and the original file
        # Remove the original file locally and from item["files"]
        if len(localfiles) > 1:
            file_no_ocr = next(
                file
                for file in localfiles
                if file.get("mime_type", "") != "application/pdf"
            )
            self.storage_handler.delete(file_no_ocr["content_url"])

            files = [file for file in files if file.get("id", "") != file_no_ocr["id"]]
            item["files"] = files

        if ori := item.get("msg:ori"):
            # Delete converted .html
            Path(item["msg:html"]).unlink()
            localfiles[0]["content_url"] = ori
            localfiles[0]["mime_type"] = "application/vnd.ms-outlook"

        return localfiles[0]

    def _reject_item(
        self,
        item_id: str,
        rejection_id: str,
        file: "dict[str, str] | None" = None,
    ) -> None:
        """Reject item.

        Add rejected item to redis hash and delete the file on local
        filesystem.

        Args:
            item_id: Squirro item id.
            rejection_id: Rejection ID/code.
            file: File on local disk to remove.
        """
        msg = f"Rejecting item `{item_id}` with status code `{rejection_id}`"
        self.log.info(msg)

        try:
            self.redis_client.hset(self.item_rejection_hash, item_id, rejection_id)
            if file:
                self.storage_handler.delete(file["content_url"])
        except Exception:
            self.log.exception("%s failed", msg)
            raise

    def _add_success_entry(self, item: "dict[str, Any]") -> None:
        """Add the successfully ingested item to redis for activity tracking.

        The entry is stored as JSON in the success hash, keyed by the item
        id. Labels that are missing or empty fall back to a placeholder.

        Args:
            item: The successful item
        """
        item_id: str = item["id"]
        keywords: dict[str, Any] = item["keywords"]

        def first(label: str, default: object) -> object:
            """Return the first value of a label, or `default` if absent/empty."""
            values = keywords.get(label)
            return values[0] if values else default

        success_entry: dict[str, Any] = {
            "id": item_id,
            "title": item["title"],
            "created_at": item["created_at"],
            "source_type": first("source_type", UNKNOWN),
            # NOTE(review): unlike the other labels this stores the whole
            # list, not its first value. Kept as is because consumers of the
            # entry may rely on it; confirm whether that is intended.
            "company_name": keywords.get("company_name", [UNKNOWN]),
            "document_type": first("document_type", UNKNOWN),
            "document_date": first("document_date", UNKNOWN),
            "num_pages": first("num_pages", 0),
            "user_name": first("user_name", UNKNOWN),
            "user_email": first("user_email", UNKNOWN),
            "ingestion_status": "Success",
        }

        # Optional references are only included when they have a value.
        if wfi_id := first("wfi_id_original", ""):
            success_entry["wfi_id_original"] = wfi_id

        if zip_reference := first("zip_reference", ""):
            success_entry["zip_reference"] = zip_reference

        self.redis_client.hset(
            self.item_success_hash, item_id, json.dumps(success_entry)
        )

    def _add_to_retry_redis(
        self,
        item: "dict[str, Any]",
        original_file: "dict[str, str]",
        document_id: str,
    ) -> None:
        """Queue a failed item in redis so it can be retried.

        The item, its file and the WFI document id are stored as JSON in the
        failed-items hash, keyed by the file's content url. Failures are
        logged and re-raised.

        Args:
            item: The item that failed.
            original_file: Original file to store into WFI.
            document_id: The WFI document ID if exists.
        """
        item_id = item.get("id", "")
        retry_record = {
            "item": item,
            "original_file": original_file,
            "wfi_document_id": document_id,
        }

        notice = f"Adding item `{item_id}` to Redis hash `{REDIS_FAILED_ITEMS_HASH}`"
        self.log.warning(notice)

        try:
            # Serialization stays inside the `try` so that its errors are
            # logged like Redis errors.
            hash_field = original_file.get("content_url", "")
            serialized = json.dumps(retry_record)
            self.redis_client.hset(REDIS_FAILED_ITEMS_HASH, hash_field, serialized)
        except Exception:
            self.log.exception("%s failed", notice)
            raise
