"""Update items in Squirro from a CSV file."""

import hashlib
import logging
import time
from argparse import ArgumentParser
from io import BytesIO
from pathlib import Path
from typing import TYPE_CHECKING

import numpy as np
import pandas as pd
import requests

from octopus.utils import load_config, set_log_verbosity
from squirro_client import SquirroClient

if TYPE_CHECKING:
    from typing import Any


# Set the log verbosity to INFO through the octopus helper.
set_log_verbosity(logging.INFO)
# Exact length an ID must have for a row to be processed (see extract_values).
# NOTE(review): 36 matches the canonical UUID string length - confirm.
STR_LEN = 36


################################################################################
# Module-level setup. Everything in this section runs at import time: the
# command line is parsed, the configuration is loaded and the Squirro client
# is authenticated. The resulting globals are read by the functions below.

# Command-line interface.
parser = ArgumentParser()
parser.add_argument("-i", "--input", type=str, help="Input CSV file to process")
parser.add_argument(
    "--compute-binary-hash",
    action="store_true",
    help="Whether to compute the binary hash",
)
# Each entry is split on ":" by extract_values into a source CSV column and the
# column name written to to_process.csv.
parser.add_argument("--fields", nargs="+", help="Fields to modify")
args = parser.parse_args()

# Configuration comes from the octopus helper. It is accessed both as
# get(section, option) and by subscript.
# NOTE(review): looks like a configparser-style object - confirm against
# octopus.utils.load_config.
cfg = load_config()
project_id = cfg.get("squirro", "project_id")
# Base URL of the WFI API used by compute_hash to download document content.
base_url = f"{cfg['wfi']['url']}:{cfg['wfi']['port']}/wfiapi"
# HTTP headers sent with every WFI API request issued by compute_hash.
headers = {
    "domain": cfg["wfi"]["domain"],
    "objectstore": cfg["wfi"]["objectstore"],
    "channel": cfg["wfi"]["channel"],
    "wfiapi-octopus-api-key": cfg["wfi"]["api_key"],
}

# Squirro client shared by the whole script, authenticated with the refresh
# token from the configuration.
# NOTE(review): the two positional None arguments are presumably the client
# id/secret - confirm against the SquirroClient constructor.
sq_client = SquirroClient(
    None,
    None,
    cluster=cfg.get("squirro", "cluster"),
)
sq_client.authenticate(refresh_token=cfg.get("squirro", "token"))
################################################################################


def check_if_file_exists(str_path: str) -> Path:
    """Turn a path string into a ``Path`` after verifying it is a regular file.

    Args:
        str_path: Location of the file as a string

    Returns:
        The ``Path`` built from ``str_path``

    Raises:
        FileNotFoundError: If nothing exists at the location, or if what
            exists there is not a regular file (e.g. a directory)
    """
    candidate = Path(str_path)

    # ``is_file`` is already False for a missing path, so it covers both the
    # "does not exist" and the "is not a file" cases.
    if not candidate.is_file():
        message = f"File {candidate} does not exist"
        logging.error(message)
        raise FileNotFoundError(message)

    return candidate


def compute_hash(doc_id: str) -> str:
    """Download a document's binary content from WFI and hash it with BLAKE2b.

    Args:
        doc_id: The id of the document whose content is fetched

    Returns:
        The hexadecimal BLAKE2b digest of the downloaded content

    Raises:
        requests.HTTPError: If the WFI API answers with an error status
        ValueError: If the content is empty
    """
    # The doubled braces put literal "{" and "}" around the id in the URL.
    res: requests.models.Response = requests.get(
        f"{base_url}/fetchcontentbydocid/{{{doc_id}}}",
        headers=headers,
        timeout=300,
    )
    res.raise_for_status()

    # An empty body is ``b""`` rather than ``None``, so emptiness must be
    # tested by truthiness; an ``is None`` check would hash empty input.
    content: bytes = res.content
    if not content:
        msg = f"Content is empty for document {doc_id}"
        logging.error(msg)
        raise ValueError(msg)

    # The payload is already fully in memory, so it can be hashed in one call.
    return hashlib.blake2b(content).hexdigest()


def extract_values(
    input_file: Path,
    fields: "list[str] | None" = None,
) -> None:
    """Split the input file into rows to process and rows with malformed IDs.

    The tab-separated ``input_file`` is read with every column as a string.
    Two tab-separated files are written to the current working directory:

    * ``to_process.csv``: the rows whose two IDs are both exactly ``STR_LEN``
      characters long, with the columns ``wfi_document_id``,
      ``wfi_id_original`` and one column per entry of ``fields``.
    * ``weird.csv``: all remaining rows with their original columns, where a
      missing ``id1``/``id2`` has been filled from the other one.

    Args:
        input_file: The input file; it must contain the columns ``id1`` and
            ``id2``
        fields: Mappings of the form ``csv_column:output_column``; the values
            of ``csv_column`` are written to ``to_process.csv`` under
            ``output_column``

    Raises:
        ValueError: If a field mapping does not consist of exactly two parts
            separated by ``:``
        KeyError: If ``id1``, ``id2`` or a mapped column is missing
    """
    df = pd.read_csv(input_file, sep="\t", dtype=str)

    # id2 is the wfi_document_id stored in Squirro
    # If id2 is null, then id1 is the wfi_document_id stored in Squirro
    document_id = df["id2"].fillna(df["id1"])
    original_id = df["id1"].fillna(df["id2"])

    # A row is usable only when both IDs are exactly STR_LEN characters long.
    # A missing ID has a NaN length, which never equals STR_LEN, so rows
    # without any ID are classified as malformed as well.
    valid = (document_id.str.len() == STR_LEN) & (original_id.str.len() == STR_LEN)

    res = pd.DataFrame(
        {
            "wfi_document_id": document_id[valid],
            "wfi_id_original": original_id[valid],
        }
    )
    for field in fields or []:
        parts = field.split(":")
        if len(parts) != 2:  # noqa: PLR2004
            msg = f"Invalid field mapping {field!r}, expected 'csv_column:output_column'"
            logging.error(msg)
            raise ValueError(msg)
        source_column, target_column = parts
        res[target_column] = df.loc[valid, source_column]

    res.to_csv("to_process.csv", index=False, sep="\t")

    # Save rows where WFI ID does not adhere to 36 characters. The filter has
    # to use the filled id columns: "wfi_document_id"/"wfi_id_original" only
    # exist in ``res``, not in the input frame.
    weird = df[~valid].copy()
    weird["id2"] = document_id[~valid]
    weird["id1"] = original_id[~valid]
    weird.to_csv("weird.csv", index=False, sep="\t")


def _call_with_retry(
    retry_message: str,
    func: "Any",
    *args: "Any",
    **kwargs: "Any",
) -> "Any":
    """Call ``func(*args, **kwargs)``, retrying a bounded number of times.

    Args:
        retry_message: Message logged (with traceback) before each retry
        func: The callable to invoke
        *args: Positional arguments forwarded to ``func``
        **kwargs: Keyword arguments forwarded to ``func``

    Returns:
        Whatever ``func`` returns

    Raises:
        Exception: Whatever ``func`` raises on the final attempt
    """
    max_attempts = 5
    delay_seconds = 5
    for _ in range(max_attempts - 1):
        try:
            return func(*args, **kwargs)
        except Exception:
            logging.exception(retry_message)
            time.sleep(delay_seconds)
    # Final attempt: let the exception propagate so the caller can decide.
    return func(*args, **kwargs)


def _build_modification(
    item: "dict[str, Any]",
    rows_by_id: "dict[str, pd.Series]",
    *,
    compute_binary_hash: bool,
) -> "dict[str, Any] | None":
    """Build the ``modify_items`` payload for one Squirro item.

    Args:
        item: An item returned by the Squirro query
        rows_by_id: Rows of the current batch keyed by ``wfi_document_id``
        compute_binary_hash: Whether to add a ``binary_hash`` keyword

    Returns:
        The payload, or ``None`` when the item matches no row of the batch or
        its binary hash could not be computed
    """
    candidates = item.get("keywords", {}).get("wfi_document_id") or [""]
    logging.info("%s:%s", item["id"], candidates[0])

    # An item may carry several wfi_document_id values; use the first one that
    # belongs to this batch instead of assuming the first value matches.
    wid = next((value for value in candidates if value in rows_by_id), None)
    if wid is None:
        logging.warning(
            "Item %s does not match any row of the current batch", item["id"]
        )
        return None
    row = rows_by_id[wid]

    # Generate keywords to be modified excluding the wfi_document_id and
    # wfi_id_original and ignore column that is empty or nan
    keywords: dict[str, Any] = {
        k: [v]
        for k, v in row.items()
        if k not in {"wfi_document_id", "wfi_id_original"} and pd.notna(v)
    }

    if compute_binary_hash:
        # The original id must be read from the row: it is deliberately left
        # out of ``keywords`` above, so looking it up there can never work.
        wid_ori = row.get("wfi_id_original")
        if isinstance(wid_ori, str) and wid_ori:
            try:
                keywords["binary_hash"] = [compute_hash(wid_ori)]
            except Exception:
                logging.exception("Failed to compute binary hash for %s", wid)
                return None

    return {"id": item["id"], "keywords": keywords}


def update_squirro_items(
    input_file: Path,
    *,
    compute_binary_hash: bool,
) -> None:
    """Update the items in Squirro.

    The tab-separated ``input_file`` is processed in batches of at most ten
    rows. For each batch the matching Squirro items are queried by
    ``wfi_document_id`` and every other non-empty column of the matching row
    (except ``wfi_id_original``) is written to the item as a keyword.

    Squirro calls are retried a bounded number of times. A batch whose query
    keeps failing is skipped. Items of a batch whose update keeps failing are
    retried once more, one by one, at the end; items that still fail are
    logged and left unchanged.

    Args:
        input_file: The input file; it must contain the column
            ``wfi_document_id``
        compute_binary_hash: Whether to compute the binary hash
    """
    df = pd.read_csv(input_file, sep="\t", dtype=str)

    # Batches never exceed the ``count`` passed to the query, so a one-to-one
    # match between rows and items cannot be truncated by the result limit.
    batch_size = 10
    failed: list[dict[str, Any]] = []
    for start in range(0, len(df), batch_size):
        chunk = df.iloc[start : start + batch_size]

        # Lookup table for this batch; the first row wins for duplicate ids.
        rows_by_id: dict[str, pd.Series] = {}
        for _, row in chunk.iterrows():
            wfi_id = row["wfi_document_id"]
            if isinstance(wfi_id, str) and wfi_id not in rows_by_id:
                rows_by_id[wfi_id] = row
        if not rows_by_id:
            continue

        # Construct query using wfi_document_id=... OR ... OR ...
        query = " OR ".join(f"wfi_document_id:{wfi_id}" for wfi_id in rows_by_id)

        # Get the items from Squirro
        try:
            res: list[dict[str, Any]] = _call_with_retry(
                "Retry after 5 seconds",
                sq_client.query,
                project_id,
                query=query,
                fields=["keywords"],
                count=batch_size,
            )["items"]
        except Exception:
            logging.exception(
                "Query failed, skipping rows %d-%d", start, start + len(chunk) - 1
            )
            continue

        # Modify the items in the batch with columns from df as keywords key
        items = []
        for item in res:
            modify = _build_modification(
                item, rows_by_id, compute_binary_hash=compute_binary_hash
            )
            if modify is not None:
                items.append(modify)

        # Update the items in Squirro
        if items:
            try:
                _call_with_retry(
                    "Retry modify items after 5 seconds",
                    sq_client.modify_items,
                    project_id,
                    items,
                )
            except Exception:
                logging.exception("Batch update failed, items queued for retry")
                failed.extend(items)
        time.sleep(1)

    if failed:
        logging.info("Retry failed items")
        # Each item gets exactly one more attempt; re-queueing a failing item
        # would loop forever on a permanent error.
        still_failing = []
        for item in failed:
            try:
                sq_client.modify_items(project_id, [item])
            except Exception:
                logging.exception("Failed to modify item %s", item)
                still_failing.append(item["id"])
            time.sleep(1)
        if still_failing:
            logging.error(
                "%d item(s) could not be updated: %s",
                len(still_failing),
                still_failing,
            )


def main() -> None:
    """Run the whole update from the parsed command-line arguments.

    The input file is validated, split by ``extract_values`` and the
    resulting ``to_process.csv`` is pushed to Squirro.

    Raises:
        ValueError: If the input file is not provided.
    """
    requested_input = args.input
    if not requested_input:
        error_text = "Input file not provided"
        logging.error(error_text)
        raise ValueError(error_text)

    logging.info("Reading input file: %s", requested_input)
    source_csv = check_if_file_exists(requested_input)

    # Step 1: write to_process.csv (and weird.csv) from the input file.
    extract_values(source_csv, args.fields)

    # Step 2: apply the rows of to_process.csv to the items in Squirro.
    update_squirro_items(
        check_if_file_exists("to_process.csv"),
        compute_binary_hash=args.compute_binary_hash,
    )


# Run the update only when this file is executed as a script.
if __name__ == "__main__":
    main()
