# mypy: ignore-errors
import logging
import re
import tempfile
from datetime import datetime
from io import BytesIO
from typing import TYPE_CHECKING

import camelot
import pandas as pd
import requests
from flask import make_response, render_template, request, send_file
from openpyxl import Workbook
from openpyxl.utils import get_column_letter

from squirro.common.config import get_config
from squirro.common.dependency import DependencyNotFound, get_injected
from squirro.common.service_users import InternalServiceUser
from squirro.lib.storage.handler import StorageHandler
from squirro.sdk.studio import StudioPlugin
from squirro_client.exceptions import NotFoundError

if TYPE_CHECKING:

    from flask import Response

# Studio plugin object; its ``route`` decorator registers the endpoint below.
plugin = StudioPlugin(__name__)
# Module-level logger shared by every function in this file.
log = logging.getLogger(__name__)

# Import-time side effect: configure root logging at DEBUG level, then raise
# the threshold of the "pdfminer" logger to WARNING (presumably to silence its
# verbose debug output — confirm).
logging.basicConfig(level=logging.DEBUG)
logging.getLogger("pdfminer").setLevel(logging.WARNING)

# Initialise config from the injected dependencies.
# NOTE(review): when the dependency is not available the error is swallowed
# and `config` is never bound, so `_get_frontend_token` would fail with a
# NameError on first use — confirm this is acceptable.
try:
    config = get_injected("config")
except DependencyNotFound:
    pass


def _get_frontend_token(tenant: str) -> str:
    """Return a service-user token for the ``frontend`` service of *tenant*.

    The client credentials are read from the ``user_service`` section of the
    module-level ``config``.

    Raises:
        Exception: Any error raised while requesting the token is logged and
            re-raised unchanged.
    """
    client_id = config.get("user_service", "client_id")
    client_secret = config.get("user_service", "client_secret")
    service_user = InternalServiceUser(
        client_id=client_id, client_secret=client_secret
    )

    try:
        return service_user.get_service_user_token(
            service_name="frontend", tenant=tenant
        )
    except Exception:
        log.exception("Failed to get user token.")
        raise


@plugin.route("/table", methods=["GET"], allow_project_readers=True)
# pylint: disable-next=too-many-locals, too-many-statements
def get_table() -> "Response | tuple[str, int] | None":
    """Extract the tables of a selected PDF page area and return them as Excel.

    Reads ``project_id``, ``item_id``, ``page`` and the selection coordinates
    from the query string, downloads the first file attached to the item,
    runs ``extract_table`` on it and reports the outcome through
    ``update_activity_monitoring``.

    Returns:
        * a 404 response when the item or its file attachment is missing;
        * the rendered ``no_tables.html`` page with status 200 when no table
          was found;
        * the workbook as an ``.xlsx`` attachment on success;
        * ``None`` when sending the workbook failed (the error is logged).

    Raises:
        ValueError: If the attachment link is not a recognised storage link.
        Exception: Errors while creating the client, downloading the file,
            extracting the table or rendering the template are logged and
            re-raised.
    """
    try:
        client = get_injected("squirro_client")
        frontend_token = _get_frontend_token(client.tenant)
    except Exception:
        log.exception("An error occured while instantiating Squirro client.")
        raise

    args = request.args
    project_id = args.get("project_id")

    # The project title is only used for reporting, so a failed lookup is
    # logged and tolerated. Default to None so the name is always bound;
    # otherwise the later uses would raise a NameError.
    project_name = None
    try:
        project_name = client.get_project(project_id).get("title")
    except Exception:  # pylint: disable=broad-except
        log.exception("Could not get project details.")

    try:
        item = client.get_item(project_id, args.get("item_id"))
    except NotFoundError:
        ret = make_response("<h1>Could not find the given document.")
        ret.status_code = 404
        return ret
    files = item.get("item", {}).get("files")
    if not files:
        ret = make_response("<h1>File attachment not found.")
        ret.status_code = 404
        return ret

    # This is a bit of a hack, as we don't get the original storage URL back
    # here. Instead our URL here is /storage/… (or /studio/storage/…), which
    # is mapped back onto the storage:// scheme.
    link = files[0]["link"]
    for prefix in ("/storage/", "/studio/storage/"):
        if link.startswith(prefix):
            content_url = "storage://" + link[len(prefix):]
            break
    else:
        raise ValueError(f"Storage link could not be identified: {link}")

    # Download the attachment from storage
    try:
        storage_config = get_config("squirro.lib.storage")
        storage = StorageHandler(storage_config)
        content = storage.download(content_url)
    except Exception:
        log.exception("An error occured while trying to download %s", content_url)
        raise

    # The extraction reads the PDF by file name, so spill the downloaded
    # content to a temporary file for the duration of the extraction.
    with tempfile.NamedTemporaryFile(suffix=".pdf") as tmpf:
        tmpf.write(content)
        # Seeking also flushes the write buffer, making the content visible
        # to readers that open the file by name.
        tmpf.seek(0)

        try:
            table = extract_table(tmpf.name, args.to_dict())
        except Exception:
            log.exception("An error occured while extracting table")
            raise

    output_fname = item["item"]["title"]

    if not table:
        update_activity_monitoring(
            args, frontend_token, project_name, output_fname, pdf_outcome="Fail"
        )
        try:
            return (
                render_template(
                    "no_tables.html",
                    page=args["page"],
                    title=output_fname,
                    project_name=project_name,
                ),
                200,
            )
        except Exception:
            log.exception(
                "An error occured while rendering template where no table is found."
            )
            raise

    filename = f"{output_fname} - page {args['page']}.xlsx"

    update_activity_monitoring(
        args, frontend_token, project_name, output_fname, pdf_outcome="Success"
    )

    try:
        # NOTE(review): `attachment_filename` was renamed to `download_name`
        # in Flask 2.0 and dropped in later releases — confirm the pinned
        # Flask version before upgrading.
        return send_file(
            table,
            as_attachment=True,
            mimetype=(
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            ),
            attachment_filename=filename,
        )
    except Exception:  # pylint: disable=broad-except
        log.exception(
            "An error occured while sending back the excel spreadsheet with "
            "extracted tables."
        )

    return None


def update_activity_monitoring(
    args,
    frontend_token: str,
    project_name: str,
    title: "str | None" = None,
    pdf_outcome: "str | None" = None,
) -> None:
    """Report the outcome of a table extraction to the ``/activity`` endpoint.

    Args:
        args: Request arguments; must provide ``get`` and ``to_dict`` and
            carry the selection coordinates read by ``create_table_area``.
        frontend_token: Token sent as the ``token`` query parameter.
        project_name: Title of the project the document belongs to.
        title: Title of the PDF document.
        pdf_outcome: Suffix appended to the ``custom.pdfExtraction`` action
            name; callers in this file pass ``"Success"`` or ``"Fail"``.

    Raises:
        Exception: Invalid coordinates (from ``create_table_area``) and
            errors while sending the request are logged and re-raised.
    """
    # Local import keeps this block self-contained; `datetime.utcnow()` is
    # deprecated, the timezone-aware call below formats to the same string.
    from datetime import timezone

    activity_data = {
        "action": f"custom.pdfExtraction{pdf_outcome}",
        "pdf_document_id": args.get("item_id"),
        "pdf_table_area": create_table_area(args.to_dict()),
        "pdf_document_title": title,
        "project": {"id": args.get("project_id"), "title": project_name},
        "now": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S"),
        "pdf_page": args.get("page"),
        "project_name": project_name,
    }

    try:
        # Passing the token through `params` lets requests URL-encode it
        # instead of interpolating it raw into the query string.
        response = requests.post(
            "http://localhost/activity",
            params={"token": frontend_token},
            json=activity_data,
            timeout=10,
        )
    except Exception:
        log.exception("Failed to send request to the /activity endpoint")
        raise

    # A rejected request does not abort the extraction, but should not go
    # unnoticed either.
    if not response.ok:
        log.warning(
            "The /activity endpoint responded with status %s", response.status_code
        )


def create_table_area(params: "dict[str, str]") -> str:
    """Build the ``x1,y1,x2,y2`` table area string from the selection params.

    The x coordinates are taken as-is from ``start_x``/``end_x``; the y
    coordinates are ``pdf_height`` minus ``start_y``/``end_y``. All values
    are parsed as integers.

    Raises:
        Exception: A missing or non-integer parameter is logged and the
            original error is re-raised.
    """
    try:
        left = int(params["start_x"])
        page_height = int(params["pdf_height"])
        top = page_height - int(params["start_y"])
        right = int(params["end_x"])
        bottom = page_height - int(params["end_y"])
    except Exception:
        log.exception(
            "Invalid request params. Failed to retrieve the coordinates for extraction."
        )
        raise

    return ",".join(str(coord) for coord in (left, top, right, bottom))


# pylint: disable=too-many-branches, too-many-locals, too-many-statements
def extract_table(fname: str, params: "dict[str, str]") -> "BytesIO | None":
    """Extract the tables found in the selected page area into an Excel workbook.

    Args:
        fname: Path of the PDF file to read.
        params: Request parameters: ``page`` plus the coordinates read by
            ``create_table_area``.

    Returns:
        A buffer positioned at its start that holds the ``.xlsx`` workbook
        with one sheet per extracted table, or ``None`` when no extraction
        mode found a table.

    Raises:
        Exception: Missing or invalid parameters (``page`` or the
            coordinates).
    """
    page: str = params["page"]

    # Work out coordinates based on height/pdf height, width/pdf/width
    table_area: str = _normalise_table_area(create_table_area(params))

    tables = _read_tables(fname, page, table_area)
    if not tables:
        return None

    wb = Workbook()
    for i, table in enumerate(tables):
        sheet_name: str = f"Page {table.page} - Table {table.order}"

        # A new workbook already has one sheet; reuse it for the first table.
        if i == 0:
            ws = wb.active
            ws.title = sheet_name
        else:
            ws = wb.create_sheet(sheet_name)

        _write_table_to_sheet(ws, _drop_empty_edges(table.df))

    # Save straight into memory; no temporary file round-trip is needed.
    buffer = BytesIO()
    wb.save(buffer)
    buffer.seek(0)

    return buffer


# Extraction attempts, tried in order until one of them yields a table.
_EXTRACTION_MODES = (
    ("lattice", {"process_background": True}),
    ("stream", {}),
    ("lattice", {"line_scale": 50}),
)

# Excel number formats keyed by the format names returned by Helper.get_format.
_EXCEL_NUMBER_FORMATS = {
    "percentage_format": "0.00%",
    "float_format": "0.00",
    "integer_format": "#,##0",
    "currency_format": "$#,##0",
}


def _normalise_table_area(table_area: str) -> str:
    """Reorder an ``x1,y1,x2,y2`` area so it runs top-left to bottom-right."""
    # See here:
    # https://camelot-py.readthedocs.io/en/master/user/advanced.html#specify-table-areas
    # table_areas accepts strings of the form x1,y1,x2,y2
    # where (x1, y1) -> top-left and (x2, y2) -> bottom-right in PDF
    # coordinate space. In PDF coordinate space, the bottom-left corner of the
    # page is the origin, with coordinates (0, 0).
    x1, y1, x2, y2 = (float(coord) for coord in table_area.split(","))

    if x1 > x2:
        log.info("Swapping horizontal coordinates")
        x1, x2 = x2, x1

    if y2 > y1:
        log.info("Swapping vertical coordinates")
        y1, y2 = y2, y1

    return f"{x1:.1f},{y1:.1f},{x2:.1f},{y2:.1f}"


def _read_tables(fname: str, page: str, table_area: str):
    """Return the tables of the first extraction mode that finds any."""
    tables = None

    for mode, mode_args in _EXTRACTION_MODES:
        log.debug("Attempting extraction with %s mode", mode)
        try:
            tables = camelot.read_pdf(
                fname, flavor=mode, pages=page, table_areas=[table_area], **mode_args
            )
        except Exception:  # pylint: disable=broad-except
            # A failing mode must not prevent the remaining modes from running.
            log.exception("Could not extract table using %s mode", mode)
            continue

        if len(tables) > 0:
            log.debug("Found %d tables using %s mode", len(tables), mode)
            break

    return tables


def _drop_empty_edges(df: pd.DataFrame) -> pd.DataFrame:
    """Drop the first/last row and column of *df* when they are empty."""
    # Cells can be empty strings rather than NaN (Helper.get_format handles
    # "" explicitly), so both count as empty here.
    empty_cells = df.isna() | (df == "")
    empty_rows = empty_cells.all(axis=1)
    empty_columns = empty_cells.all()

    # Only the outermost rows/columns are candidates for removal.
    empty_rows.iloc[1:-1] = False
    empty_columns.iloc[1:-1] = False

    df = df.loc[~empty_rows, :]
    return df.loc[:, ~empty_columns]


def _write_table_to_sheet(ws, df: pd.DataFrame) -> None:
    """Write every cell of *df* to worksheet *ws* and size the columns."""
    for c, col in enumerate(df, 1):
        series = df[col]
        max_col_len = len(str(series.name))

        for r, value_str in enumerate(series, 1):
            max_col_len = max(max_col_len, len(str(value_str)))

            value, formatting = Helper.get_format(value_str)

            # Write to ws with correct values and formats
            cell = ws.cell(row=r, column=c, value=value)
            if formatting is not None:
                cell.number_format = _EXCEL_NUMBER_FORMATS[formatting]

        # Set the column size to the longest cell text plus one
        ws.column_dimensions[get_column_letter(c)].width = max_col_len + 1


class Helper:
    """Helper class for turning cell text into typed values with a format name."""

    # Maps a regular expression to the name of the number format to apply to
    # values it matches. Patterns are tried in insertion order.
    REGEX_TO_FORMAT: "dict[str, str]" = {
        # 1, -10, 1000, 1,000, 1,000,000
        r"^[+-]?[\s]?(\d+|\d{1,3}(,\d{3})*)$": "integer_format",
        # 1.1 -1.1, 1000.1, 1,000.1
        r"^[+-]?[\s]?(\d*|\d{1,3}(,\d{3})*)[.]\d*$": "float_format",
        # 10% 1.1% -10%
        r"^[+-]?[\s]?(\d+|\d{1,3}(,\d{3})*)(?:\.\d+)?\s?%$": "percentage_format",
        # $1 $1,000 $10.1,
        r"^\$[\s]?(0|[1-9][0-9]*)(?:,\d{3})*(\.\d{1,2})?$": "currency_format",
    }

    @staticmethod
    def get_format(
        value: "str | None",
    ) -> "tuple[str | int | float, str | None]":
        """Convert *value* to a number when it looks like one.

        Returns:
            ``(converted_value, format_name)`` when *value* matches one of
            ``REGEX_TO_FORMAT`` and converts cleanly: integers become ``int``,
            everything else ``float``, with percentages divided by 100.
            ``("", None)`` for ``None`` or an empty string, and
            ``(value, None)`` for anything else.
        """
        if value is None:
            return ("", None)

        # Non-string values cannot be matched against the patterns; pass them
        # through untouched instead of failing inside `re.search`.
        if not isinstance(value, str):
            return (value, None)

        if value == "":
            return ("", None)

        # No pattern can yield a number without a digit. Bailing out early
        # also stops digit-less text such as "." (which the float pattern
        # matches) from being logged as a conversion error.
        if not re.search(r"\d", value):
            return (value, None)

        for regex, output_format in Helper.REGEX_TO_FORMAT.items():
            if not re.search(regex, value):
                continue

            stripped = Helper.strip_chars(value)
            try:
                if output_format == "integer_format":
                    return (int(stripped), output_format)
                number = float(stripped)
            except ValueError:
                log.exception("An error occured while formatting the value %s", value)
                continue

            if output_format == "percentage_format":
                # Excel percentage formats expect the fraction, not the percent.
                number /= 100

            return (number, output_format)

        return (value, None)

    @staticmethod
    def strip_chars(string: str, regex: str = r"[\s%,\$]") -> str:
        """Return *string* with every match of *regex* removed.

        By default whitespace, ``%``, ``,`` and ``$`` are removed.
        """
        return re.sub(regex, "", string)
