"""Extract key-value pairs from the email body."""

import re
from datetime import datetime
from typing import TYPE_CHECKING

import pytz

from squirro.sdk import PipeletV1, require

if TYPE_CHECKING:
    from logging import Logger
    from typing import Any


# Maps a key found in the email body to the item label that receives its
# values. Lookups are exact, so body keys must use these spellings.
KEYS_TO_LABELS_MAPPING = {
    "uid": "company_uid",
    "document type": "document_type_true",
    "document date": "document_date_true",
    "reference": "reference_true",
    "audit viewing": "audit_viewing_true",
}

# `strptime` formats accepted for the document date, in the order they are
# tried: two day-first formats, then every year-first layout with a
# four-digit year (`%Y`), then the same layouts with a two-digit year (`%y`).
# "\u2013" is the en dash, accepted as an alternative to the hyphen.
SUPPORTED_DATE_FORMATS = (
    "%d-%m-%Y",
    "%d/%m/%Y",
    *(
        year_directive + month_day_layout
        for year_directive in ("%Y", "%y")
        for month_day_layout in (
            "%m%d",
            "%m-%d",
            "-%m%d",
            "-%m-%d",
            "%m\u2013%d",
            "\u2013%m%d",
            "\u2013%m\u2013%d",
            "/%m/%d",
        )
    ),
)


def parse_document_date(date_string: str) -> str:
    """Normalise a document date to the ``YYYY-MM-DD`` form.

    Every entry of ``SUPPORTED_DATE_FORMATS`` is tried in order; the first
    format that parses the input decides the result.

    Args:
        date_string: The document date to parse

    Returns:
        The parsed document date, formatted with ``%Y-%m-%d``

    Raises:
        ValueError: If none of the supported formats matches the input
    """
    for candidate_format in SUPPORTED_DATE_FORMATS:
        try:
            parsed = datetime.strptime(date_string, candidate_format)
        except ValueError:
            # Not written in this format; try the next candidate.
            continue

        # NOTE(review): pytz recommends `localize()` over
        # `replace(tzinfo=...)` for attaching its zones. Only the date
        # fields are formatted below, so the attached zone does not change
        # the returned string.
        in_singapore = parsed.replace(tzinfo=pytz.timezone("Asia/Singapore"))
        return in_singapore.strftime("%Y-%m-%d")

    msg = f"Document date `{date_string}` is not supported"
    raise ValueError(msg)


def parse_email_body(email_body: "str") -> "dict[str, Any]":
    """Parse ``key: value_1; value_2`` pairs out of the email body.

    A pair is a key, the separator ``": "`` and a value. The key is matched
    greedily, so when a line holds several ``": "`` separators the key runs
    up to the last one. The value ends at the first carriage return or line
    feed, which keeps pairs apart for both CRLF and bare LF line endings.
    The value is then split on ``;``; each part is stripped of surrounding
    whitespace and empty parts are dropped, so a pair without a value maps
    to an empty list. If a key occurs more than once, the last one wins.

    Args:
        email_body: The email body to parse

    Returns:
        A dictionary mapping each key to the list of its values
    """
    # Excluding the line feed as well as the carriage return stops the first
    # value from swallowing all following lines of an LF-only body.
    key_value_pattern = r"(?P<key>.*): (?P<value>[^\r\n]*)"

    return {
        match.group("key"): [
            stripped_value
            for value in match.group("value").split(";")
            if (stripped_value := value.strip())
        ]
        for match in re.finditer(key_value_pattern, email_body)
    }


@require("log")
class TagEmailMetadata(PipeletV1):  # type: ignore[misc]
    """Extract key-value pairs from the email body.

    The pipelet matches key-value pairs of the format `key_1: value_1;
    value_2` in the email body and maps them to item labels.
    """

    log: "Logger"

    def consume(self, item: "dict[str, Any]") -> "dict[str, Any]":
        """Tag the item with the labels found in its email body.

        The first entry of the `email_body` keyword is parsed into key-value
        pairs. Keys listed in `KEYS_TO_LABELS_MAPPING` are written to the
        item keywords under the mapped label; all other keys are ignored.
        The `document_date_true` label is normalised to `YYYY-MM-DD`, or
        removed when it is empty or in an unsupported format. The
        `email_body` keyword is always removed from the item.

        Args:
            item: The item to consume

        Returns:
            The consumed item; an item without keywords is returned unchanged
        """
        keywords = item.get("keywords")
        if not keywords:
            # No email body to parse and no `email_body` keyword to remove.
            return item

        email_body = keywords.get("email_body", [])

        if email_body and isinstance(email_body, list):
            email_body_dict = parse_email_body(email_body[0])

            # The lookup is exact: only keys spelled as in
            # KEYS_TO_LABELS_MAPPING are turned into labels.
            keywords.update(
                {
                    label: value
                    for key, value in email_body_dict.items()
                    if (label := KEYS_TO_LABELS_MAPPING.get(key, ""))
                }
            )

            self._normalise_document_date(keywords)

        keywords.pop("email_body", None)

        return item

    def _normalise_document_date(self, keywords: "dict[str, Any]") -> None:
        """Rewrite `document_date_true` as `YYYY-MM-DD` or drop the label."""
        if "document_date_true" not in keywords:
            return

        document_dates = keywords["document_date_true"]
        if not document_dates:
            # A `document date: ` line without a value yields an empty list;
            # there is nothing to parse, so drop the label.
            keywords.pop("document_date_true", None)
            return

        document_date = document_dates[0]
        try:
            keywords["document_date_true"] = [parse_document_date(document_date)]
        except ValueError:
            keywords.pop("document_date_true", None)
            self.log.exception(
                "Document date `%s` is not supported", document_date
            )
