"""Date extraction utilities."""

import logging
import re
from datetime import datetime
from typing import TYPE_CHECKING

import pytz

from octopus.text.preprocess import sanitize_text

if TYPE_CHECKING:
    from re import Pattern

# Full English month names mapped to their month number (1-12), in calendar
# order: a name's position in the tuple is its month number.
MONTH_LONG = {
    name: number
    for number, name in enumerate(
        (
            "January",
            "February",
            "March",
            "April",
            "May",
            "June",
            "July",
            "August",
            "September",
            "October",
            "November",
            "December",
        ),
        start=1,
    )
}
# Three-letter abbreviations ("Jan", "Feb", ...) mapped to the same numbers;
# every abbreviation is the first three letters of the full name.
MONTH_SHORT = {name[:3]: number for name, number in MONTH_LONG.items()}
# Lower-cased lookup table covering both spellings. "May" occurs in both
# tables and collapses into a single "may" entry.
MONTH_MAPPING = {
    name.lower(): number
    for table in (MONTH_LONG, MONTH_SHORT)
    for name, number in table.items()
}


def _generate_regexes() -> "Pattern[str]":
    """Construct regex to match dates.

    Every supported layout is one alternative of a single case-insensitive
    pattern wrapped in word boundaries. Each alternative captures its parts
    in named groups ``day_<n>``, ``month_<n>`` and ``year_<n>``; the numeric
    suffix only keeps the group names unique across alternatives.

    The layouts are kept in a tuple rather than a set so that the order of
    the alternatives, and therefore the compiled pattern and its group
    numbering, is identical on every run instead of varying with string hash
    randomisation.

    Returns:
        Regex to match dates.
    """
    layouts = (
        ##############################
        # Day Month Year
        ##############################
        # 01 Feb 2024
        r"(?P<day_0>{d})\s?(?P<month_0>{m2})\s?(?P<year_0>{y})",
        # 01 February 2024
        r"(?P<day_1>{d})\s?(?P<month_1>{m3})\s?(?P<year_1>{y})",
        # 01-02-2024
        r"(?P<day_2>{d})-(?P<month_2>{m1})-(?P<year_2>{y})",
        # 01-Feb-2024
        r"(?P<day_3>{d})-(?P<month_3>{m2})-(?P<year_3>{y})",
        # 01-February-2024
        r"(?P<day_4>{d})-(?P<month_4>{m3})-(?P<year_4>{y})",
        # 01/02/2024
        r"(?P<day_5>{d})/(?P<month_5>{m1})/(?P<year_5>{y})",
        ##############################
        # Month Day Year
        ##############################
        # Feb 01 2024
        r"(?P<month_10>{m2})\s?(?P<day_10>{d}) (?P<year_10>{y})",
        # February 01 2024
        r"(?P<month_11>{m3})\s?(?P<day_11>{d}) (?P<year_11>{y})",
        # Feb 01, 2024
        r"(?P<month_12>{m2})\s?(?P<day_12>{d}),\s?(?P<year_12>{y})",
        # February 01, 2024
        r"(?P<month_13>{m3})\s?(?P<day_13>{d}),\s?(?P<year_13>{y})",
        ##############################
        # Year Month Day
        ##############################
        # 2024-02-01
        r"(?P<year_20>{y})-(?P<month_20>{m1})-(?P<day_20>{d})",
        # 2024-Feb-01
        r"(?P<year_21>{y})-(?P<month_21>{m2})-(?P<day_21>{d})",
        # 2024-February-01
        r"(?P<year_22>{y})-(?P<month_22>{m3})-(?P<day_22>{d})",
        # 2024/02/01
        r"(?P<year_30>{y})/(?P<month_30>{m1})/(?P<day_30>{d})",
        # 2024/Feb/01
        r"(?P<year_31>{y})/(?P<month_31>{m2})/(?P<day_31>{d})",
        # 2024/February/01
        r"(?P<year_32>{y})/(?P<month_32>{m3})/(?P<day_32>{d})",
    )
    # Building blocks substituted into the {d}/{m1}/{m2}/{m3}/{y} placeholders.
    fragments = {
        # Single digit day can come with or without a leading zero
        # Days ending with 1, 2, 3 can come with a suffix st, nd, rd
        "d": "((3[01])|([12][0-9])|(0?[1-9]))(st|nd|rd)?",
        # Numeric month, with or without a leading zero
        "m1": "(0?[1-9]|1[0-2])",
        # Abbreviated month name
        "m2": "|".join(MONTH_SHORT),
        # Full month name
        "m3": "|".join(MONTH_LONG),
        # Four-digit year
        "y": r"\d{4}",
    }
    alternatives = "|".join(layout.format(**fragments) for layout in layouts)

    # Word boundaries stop a date from matching inside a longer token.
    return re.compile(r"\b(" + alternatives + r")\b", re.IGNORECASE)


# Compiled once at import time; date_extraction scans text with this pattern.
REGEX = _generate_regexes()


def _get_today_date() -> str:
    """Return the current calendar date in Singapore (SGT).

    Returns:
        The current date in the "Asia/Singapore" timezone, formatted as
        "YYYY-MM-DD".
    """
    singapore = pytz.timezone("Asia/Singapore")
    now = datetime.now(singapore)
    return now.strftime("%Y-%m-%d")


def _parse_value(value: str) -> int:
    """Parse one captured date component into an integer.

    Month names (full or abbreviated, in any letter case) are looked up in
    ``MONTH_MAPPING``. Anything else is treated as a number, optionally
    followed by an ordinal suffix such as "1st", "2nd" or "3RD".

    Args:
        value: Date value to parse.

    Returns:
        Parsed date value.

    Raises:
        ValueError: If the value is neither a known month name nor a number.
    """
    # MONTH_MAPPING keys are lower-case, so the lookup must use the
    # lower-cased text as well; indexing with the raw value raised KeyError
    # for inputs such as "Feb".
    key = value.lower()
    if key in MONTH_MAPPING:
        return MONTH_MAPPING[key]
    # Strip a trailing ordinal suffix. Working on the lower-cased text keeps
    # this in step with the case-insensitive date regex.
    return int(re.sub(r"(st|nd|rd|th)$", "", key))


def date_extraction(txt: str) -> str:
    """Extract the first valid date from text.

    The text is passed through ``sanitize_text`` and scanned with ``REGEX``.
    Candidates are tried in order of appearance and the first one that forms
    a real calendar date is returned. A candidate that cannot be parsed, or
    that is not a valid date (for example "31 Feb 2024"), is logged and
    skipped rather than aborting the search.

    Args:
        txt: Text to extract date from.

    Returns:
        Date in the format "YYYY-MM-DD". Today's date in SGT is returned when
        the text contains no valid date.
    """
    logging.info("Extracting date from text.")
    clean = sanitize_text(txt, keep_chars="all")
    for match in REGEX.finditer(clean):
        try:
            # Group names look like "day_3"; only the groups belonging to the
            # alternative that matched are non-None. Parsing happens inside
            # the try so a bad component skips this candidate only.
            parts = {
                field.split("_")[0]: _parse_value(value)
                for field, value in match.groupdict().items()
                if value is not None
            }
            # Only the calendar date is formatted, so no timezone is attached.
            return datetime(
                parts["year"],
                parts["month"],
                parts["day"],
            ).strftime("%Y-%m-%d")
        except (KeyError, ValueError):
            logging.exception("Error parsing date.")

    logging.info("No date found in text, returning today's date.")
    return _get_today_date()
