"""Document type clasification."""

import logging
import re
from typing import TYPE_CHECKING

from octopus.text.preprocess import sanitize_text

if TYPE_CHECKING:
    from re import Pattern
    from typing import Any


# Keyword captured after "credit " in a zip reference -> document type code.
# Key order is also the regex alternation order below. "memo" precedes
# "memorandum", so "credit memorandum" is captured as "memo"; both map to "CM".
MAPPING_ZIP_DOC_TYPE: "dict[str, str]" = {
    "memo": "CM",
    "memorandum": "CM",
    "proposal": "CP",
    "proposition": "CP",
    "review": "CR",
}
# Built from the mapping keys so every keyword the regex can capture is
# guaranteed to have a label (no KeyError on lookup by group(1)).
# Resulting pattern: credit (memo|memorandum|proposal|proposition|review)
REGEX_ZIP_DOC_TYPE = re.compile(
    r"credit (" + "|".join(map(re.escape, MAPPING_ZIP_DOC_TYPE)) + ")"
)
# ML label -> replacement label, applied when the secondary regex for that
# label matches. Empty here; presumably populated elsewhere -- TODO confirm.
MAPPING_LABELS: "dict[str, str]" = {}
# ML label -> regex run against the page text as a secondary check.
# Empty here; presumably populated elsewhere -- TODO confirm.
REGEX_MAPPING_ML_PRED_DOC_TYPE: "dict[str, Pattern[str]]" = {}
# Phrases that mark a document as "INTERNAL DOCUMENTS (BBCA)".
REGEX_INTERNAL_DOCUMENTS_BBCA = re.compile(
    "compliance checklist form 5|credit review checklist"
)


def doc_type_pred(item: "dict[str, Any]", pages: str) -> None:
    """Predict the document type of ``item`` and store it in its keywords.

    The starting label is the first entry of
    ``item["keywords"]["document_type_pred_ml"]`` (``"Unclassified"`` when the
    entry is missing or empty). Exactly one of the following rules is then
    applied:

    1. If ``zip_reference`` has a non-empty first entry, the label is replaced
       by the zip-reference document type when the sanitized reference matches
       ``REGEX_ZIP_DOC_TYPE``; otherwise the ML label is kept. No other rule
       runs in this case.
    2. Otherwise, if there is page text and the ML label has a secondary
       pattern in ``REGEX_MAPPING_ML_PRED_DOC_TYPE``, a non-empty match on the
       sanitized pages replaces the label with ``MAPPING_LABELS[label]``.
    3. Otherwise, page text matching ``REGEX_INTERNAL_DOCUMENTS_BBCA`` sets the
       label to ``"INTERNAL DOCUMENTS (BBCA)"``.

    The result is written to ``item["keywords"]["document_type_pred"]`` as a
    one-element list.

    Args:
        item: The item to tag; must contain a ``"keywords"`` dict, which is
            updated in place.
        pages: The pages to use for prediction; may be empty.

    Raises:
        KeyError: If ``item`` has no ``"keywords"`` entry, or if rule 2 matches
            for a label that is missing from ``MAPPING_LABELS``.
    """
    keywords = item["keywords"]

    # `or` fallback: a missing key and an empty list both yield the default,
    # instead of an empty list raising IndexError on `[0]`.
    pred: str = (keywords.get("document_type_pred_ml") or ["Unclassified"])[0]
    logging.info("Document type prediction from ML: %s", pred)

    zip_ref = (keywords.get("zip_reference") or [""])[0]
    if zip_ref:
        logging.info("Zip reference detected: %s", zip_ref)
        if match := REGEX_ZIP_DOC_TYPE.search(sanitize_text(zip_ref)):
            pred = MAPPING_ZIP_DOC_TYPE[match.group(1)]
    elif pages and pred in REGEX_MAPPING_ML_PRED_DOC_TYPE:
        logging.info("Running secondary prediction: %s", pred)
        match = REGEX_MAPPING_ML_PRED_DOC_TYPE[pred].search(sanitize_text(pages))
        # Only a non-empty match counts as confirmation of the ML label.
        if match and match.group(0):
            pred = MAPPING_LABELS[pred]
    else:
        logging.info("Checking if document is a Internal Documents BBCA")
        # This branch is also reached when `pages` is empty; skip sanitizing
        # and searching when there is no text to inspect.
        if pages and REGEX_INTERNAL_DOCUMENTS_BBCA.search(sanitize_text(pages)):
            pred = "INTERNAL DOCUMENTS (BBCA)"

    logging.info("Document type prediction: %s", pred)
    keywords["document_type_pred"] = [pred]
