from io import BytesIO
from pathlib import Path
from typing import TYPE_CHECKING

from msoffcrypto import OfficeFile
from pdfminer.pdfdocument import PDFPasswordIncorrect

from octopus.activity_tracking import RejectionReason
from octopus.clients import init_redis_client, init_squirro_client
from octopus.pdf import check_password_protected_pdf
from octopus.utils import get_storage_url_full_path
from squirro.lib.convert.office_pdf import OfficePdfConverter
from squirro.sdk import PipeletV1, require

if TYPE_CHECKING:
    from logging import Logger
    from typing import Any, Dict, List

# MIME types for which the password-protection check is delegated to
# msoffcrypto's ``OfficeFile.is_encrypted()``: legacy binary, macro-enabled and
# OpenXML Word/Excel/PowerPoint types, plus ``application/vnd.ms-works``.
# NOTE: despite the ``_MAP`` suffix this is a plain list; in this module it is
# only used for membership tests.
_MS_MIME_TYPE_MAP = [
    "application/msword",
    "application/vnd.ms-excel",
    "application/vnd.ms-powerpoint",
    "application/vnd.ms-excel.sheet.macroenabled.12",
    "application/vnd.ms-excel.template.macroenabled.12",
    "application/vnd.ms-powerpoint.presentation.macroenabled.12",
    "application/vnd.ms-powerpoint.slideshow.macroenabled.12",
    "application/vnd.ms-powerpoint.template.macroenabled.12",
    "application/vnd.ms-word.document.macroenabled.12",
    "application/vnd.ms-word.template.macroenabled.12",
    "application/vnd.ms-works",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
    "application/vnd.openxmlformats-officedocument.presentationml.template",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.template",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
]


@require("log")
# pylint: disable=too-few-public-methods
class FilterItemsPipelet(PipeletV1):  # type: ignore[misc]
    """Filter out items that do not conform to requirements defined by
    arguments.

    An item is dropped (``consume`` returns ``None``) when it has no ID, has
    no files, its file is missing from storage, the file is
    password-protected, the file exceeds the configured size, the file's MIME
    type is not supported, or -- when sender filtering is enabled -- the item
    comes from an email sender that is not whitelisted.

    Rejections of items that have an ID are recorded in a Redis hash (item ID
    -> rejection reason). Except for the "no files" and "file not found"
    cases, the rejected item's file is also deleted from storage.
    """

    _filter_file_size: int
    log: "Logger"
    _max_file_size = 50  # MB
    # Redis hash receiving rejections of regular items.
    _redis_hash = "item_rejection_hash"
    # Redis hash receiving rejections of items whose source type starts
    # with "WFI".
    _wfi_redis_hash = "item_wfi_rejection_hash"
    _supported_mime_types = OfficePdfConverter.SUPPORTED_MIME_TYPES + [
        "application/pdf",
        "application/vnd.ms-outlook",
        "image/jpeg",
        "image/png",
        "image/tiff",
    ]

    def __init__(self, config: "Dict[str, Any]") -> None:
        """Set up clients and read the pipelet configuration.

        Args:
            config: Pipelet configuration; the keys read here are
                ``filter_exceeding_size_files`` (size limit in MB, capped at
                ``_max_file_size``) and ``filter_whitelisted_senders``.
        """
        max_file_size = config.get("filter_exceeding_size_files")
        if max_file_size is None:
            # Key missing or explicitly null: fall back to the hard limit
            # instead of failing in ``min()``.
            max_file_size = self._max_file_size
        self._filter_file_size = min(max_file_size, self._max_file_size) * 1_000_000
        self.sq_client, self.project_id = init_squirro_client()
        self._filter_whitelisted_senders = config.get("filter_whitelisted_senders")
        self._whitelisted_emails = self._get_whitelisted_emails()
        self.redis_client = init_redis_client()

    def consume(self, item: "Dict[str, Any]") -> "Dict[str, Any] | None":
        """Return the item unchanged if it is valid, otherwise ``None``."""
        if not self._is_valid_item(item):
            return None

        return item

    @staticmethod
    def _first_keyword(item: "Dict[str, Any]", name: str) -> str:
        """Return the first value of keyword ``name`` as a string.

        Returns an empty string when the item has no keywords, the keyword is
        missing, its value list is empty, or its first value is ``None``.
        """
        values = (item.get("keywords") or {}).get(name) or [""]
        if isinstance(values, str):
            # Tolerate a bare string where a list of values is expected.
            return values
        first = values[0]
        return "" if first is None else str(first)

    def _is_password_protected(self, fp: Path, mime_type: "str") -> "bool":
        """Tell whether the file at ``fp`` is password-protected.

        Args:
            fp: Path of the file to inspect.
            mime_type: MIME type of the file; selects the check to run.

        Returns:
            ``True`` for protected PDFs (including when the PDF check raises
            ``PDFPasswordIncorrect``), for encrypted MS Office files and for
            the ``application/encrypted`` MIME type. ``False`` otherwise; for
            MIME types without a dedicated check a warning is logged.
        """
        if mime_type == "application/pdf":
            try:
                with fp.open("rb") as f:
                    return check_password_protected_pdf(BytesIO(f.read()))
            except PDFPasswordIncorrect:
                return True
        elif mime_type in _MS_MIME_TYPE_MAP:
            with fp.open("rb") as f:
                protected: bool = OfficeFile(f).is_encrypted()
            return protected  # mypy unable to infer if return directly
        elif mime_type == "application/encrypted":
            return True
        else:
            self.log.warning(
                "Skipping password protection check for file %s with mime type %s",
                str(fp),
                mime_type,
            )
        return False

    # pylint: disable=too-many-return-statements
    def _is_valid_item(self, item: "Dict[str, Any]") -> bool:
        """Run all filters against ``item``.

        The checks run in this order: item ID present, item has a file, file
        exists in storage, file is not password-protected, file size within
        the limit, MIME type supported, sender whitelisted (only for items
        with source type ``Email`` and when sender filtering is enabled).

        Returns:
            ``True`` when the item passes every check, ``False`` as soon as
            one fails.
        """
        item_id = item.get("id", "")

        # Choose the target hash per item. Assigning it to ``self`` would make
        # a single WFI item redirect the rejections of all later items.
        if self._first_keyword(item, "source_type").startswith("WFI"):
            redis_hash = self._wfi_redis_hash
        else:
            redis_hash = self._redis_hash

        # Currently each Squirro item can only refer to one file.
        # https://docs.squirro.com/en/latest/technical/pipelets/how-to/how-access.html#concept
        files = item.get("files")
        file: "Dict[str, Any] | None" = None
        path = None
        if isinstance(files, list) and files:
            file = files[0]
            path = get_storage_url_full_path(file.get("content_url", ""))

        # Without an ID nothing can be recorded in Redis, so check this first.
        if not item_id:
            self.log.warning("Rejecting item `%s`: item has no ID", item)
            if path is not None:
                path.unlink(missing_ok=True)
            return False

        if file is None or path is None:
            self.log.warning("Rejecting item `%s`: item has no files", item_id)
            self._add_item_rejection(
                item_id, RejectionReason.ITEM_HAS_NO_FILE, redis_hash
            )
            return False

        file_id = file.get("id", "")
        mime_type = file.get("mime_type", "")

        if not path.is_file():
            self.log.warning(
                "Rejecting item `%s`: file %s not found under path `%s`",
                item_id,
                file_id,
                path,
            )
            self._add_item_rejection(
                item_id, RejectionReason.FILE_NOT_FOUND, redis_hash
            )
            return False

        # NOTE: this check runs before the size and MIME type checks, so it
        # determines the reported reason when several checks would fail.
        if self._is_password_protected(path, mime_type):
            self.log.warning(
                "Rejecting item `%s`: file `%s` is password-protected",
                item_id,
                file_id,
            )
            self._add_item_rejection(
                item_id, RejectionReason.PASSWORD_PROTECTED, redis_hash
            )
            path.unlink(missing_ok=True)
            return False

        if path.stat().st_size > self._filter_file_size:
            self.log.warning(
                "Rejecting item `%s`: file `%s` exceeds maximum size",
                item_id,
                file_id,
            )
            self._add_item_rejection(
                item_id, RejectionReason.FILE_TOO_LARGE, redis_hash
            )
            path.unlink(missing_ok=True)
            return False

        if mime_type not in self._supported_mime_types:
            self.log.warning(
                "Rejecting item `%s`: file `%s` MIME type is not supported",
                item_id,
                file_id,
            )
            self._add_item_rejection(
                item_id, RejectionReason.FILE_NOT_SUPPORTED, redis_hash
            )
            path.unlink(missing_ok=True)
            return False

        if (
            self._filter_whitelisted_senders
            and self._first_keyword(item, "source_type") == "Email"
            and self._first_keyword(item, "user_email").lower()
            not in self._whitelisted_emails
        ):
            self.log.warning(
                "Rejecting item `%s`: sender's email is not whitelisted", item_id
            )
            self._add_item_rejection(
                item_id, RejectionReason.SENDER_NOT_WHITELISTED, redis_hash
            )
            path.unlink(missing_ok=True)
            return False

        return True

    def _add_item_rejection(
        self,
        item_id: "str",
        rejection_id: str,
        redis_hash: "str | None" = None,
    ) -> None:
        """Record a rejection in Redis.

        Args:
            item_id: ID of the rejected item; used as the hash field.
            rejection_id: Rejection reason stored as the field value.
            redis_hash: Name of the Redis hash to write to; defaults to
                ``self._redis_hash`` when omitted.

        Raises:
            Exception: Whatever the Redis client raises; it is logged and
                re-raised.
        """
        target_hash = self._redis_hash if redis_hash is None else redis_hash
        self.log.warning(
            "Adding item `%s` with status code `%s` to Redis hash `%s`",
            item_id,
            rejection_id,
            target_hash,
        )

        try:
            self.redis_client.hset(target_hash, item_id, rejection_id)
        except Exception:
            self.log.exception("Writing to `%s` failed", target_hash)
            raise

    def _get_whitelisted_emails(self) -> "List[str]":
        """Collect the lower-cased whitelisted sender addresses.

        The result combines the emails of the users returned by the Squirro
        client with the addresses listed under the project configuration key
        ``app.whitelisted-emails``. A failure of either lookup is logged and
        that source then contributes no addresses.
        """
        users: "Dict[str, Any]" = {}
        try:
            users = self.sq_client.get_users()
        except Exception:  # pylint: disable=broad-except
            self.log.exception("Getting users from Squirro failed")

        user_emails: "List[str]" = [
            user_email.lower()
            for user in users.get("users", [])
            if (user_email := user.get("email", ""))
        ]

        config: "Dict[str, Any]" = {}
        try:
            config = self.sq_client.get_project_configuration(self.project_id)
        except Exception:  # pylint: disable=broad-except
            self.log.exception("Getting project configuration from Squirro failed")

        config_emails: "List[str]" = [
            user_email.lower()
            for user_email in config.get("config", {})
            .get("app.whitelisted-emails", {})
            .get("value", {})
            .get("email_addresses", [])
            if user_email
        ]

        return user_emails + config_emails

    @staticmethod
    # pylint: disable-next=invalid-name
    def getArguments() -> "List[Dict[str, Any]]":  # noqa: N802 type: ignore[type-arg]
        """Return the arguments for the pipelet.

        Two arguments are declared: a boolean switch for sender whitelisting
        and an integer file size limit in MB that defaults to
        ``_max_file_size``.
        """
        return [
            {
                "name": "filter_whitelisted_senders",
                "display_label": "Filter items from whitelisted email senders",
                "type": "bool",
                "default": True,
            },
            {
                "name": "filter_exceeding_size_files",
                "display_label": (
                    "Filter items with files exceeding provided size (MB)"
                ),
                "type": "int",
                "default": FilterItemsPipelet._max_file_size,
            },
        ]
