# mypy: ignore-errors
"""Dataloader Plugin for WFI."""

import hashlib
import json
import logging
import re
import time
from pathlib import Path
from typing import TYPE_CHECKING

import pandas as pd
import requests

from octopus.clients.redis_client import init_redis_client
from octopus.clients.squirro_client import init_squirro_client
from octopus.utils import load_config
from squirro.dataloader.data_source import DataSource
from squirro_client import SquirroClient

if TYPE_CHECKING:
    from collections.abc import Generator
    from typing import Any, Dict, List

    from redis import Redis

# Maximum number of previously failed document ids taken from the Redis
# "failed_wfi_to_squirro" sorted set on each run (used by
# WFIDataLoader._retry_failed_items).
_N_RETRY_ITEMS = 10


# Set up root logging at INFO level when this module is imported.
logging.basicConfig(level=logging.INFO)


class WFILoader:
    """WFI document loader.

    Holds the WFI base URL and request headers, downloads single documents
    over HTTP, and reads lists of document ids from CSV or JSON files.
    """

    # pylint: disable=too-many-arguments
    def __init__(
        self,
        *,
        api_key: str,
        channel: str,
        domain: str,
        env: str,
        object_store: str,
        port: int,
        url: str,
    ):
        """Initialize WFI loader.

        Only builds the base URL and the request headers; no network call is
        made here.

        Args:
            api_key (str): WFI API key.
            channel (str): WFI channel.
            domain (str): WFI domain.
            env (str): WFI environment. ``"DR"`` and ``"PROD"`` select the
                ``wfiapi-octopus-api-key`` header, anything else selects
                ``wfiapi-ecoffice-api-key``.
            object_store (str): WFI object store.
            port (int): WFI port.
            url (str): WFI URL (scheme and host, without port).
        """
        # The name of the header carrying the API key depends on the environment.
        wfi_api_header_name = (
            "wfiapi-octopus-api-key"
            if env in ("DR", "PROD")
            else "wfiapi-ecoffice-api-key"
        )
        self.wfi_url = f"{url}:{port}/wfiapi"

        self.headers = {
            "domain": domain,
            "channel": channel,
            "objectstore": object_store,
            wfi_api_header_name: api_key,
        }

    def fetch_document(self, doc_id: str) -> "Dict":
        """Get document content and metadata from WFI.

        Args:
            doc_id (str): Document id, without surrounding braces (they are
                added when the request URL is built).

        Returns:
            Dict: ``wfi_id_original``, ``wfi_document_id``, ``filename``,
            ``contentBytes`` and ``contentType`` of the document.

        Raises:
            requests.HTTPError: WFI answered with an error status (logged
                before being re-raised).
            ValueError: The response has no content type, no content, or no
                ``filename=`` part in its ``content-disposition`` header.
        """
        try:
            response = requests.get(
                f"{self.wfi_url}/fetchcontentbydocid/{{{doc_id}}}",
                headers=self.headers,
                timeout=300,
            )
            response.raise_for_status()
        except requests.HTTPError:
            logging.exception("Failed to fetch document %s", doc_id)
            raise

        if (content_type := response.headers.get("content-type")) is None:
            msg = f"Content type is empty for document {doc_id}"
            logging.error(msg)
            raise ValueError(msg)

        if (content := response.content) is None:
            msg = f"Content is empty for document {doc_id}"
            logging.error(msg)
            raise ValueError(msg)

        # re.search returns None when the header has no "filename=" part, so a
        # missing filename is reported as the ValueError below instead of an
        # IndexError from indexing an empty findall() result.
        content_disposition = response.headers.get("content-disposition")
        filename_match = (
            re.search("filename=(.+)", content_disposition)
            if content_disposition is not None
            else None
        )
        if filename_match is None:
            msg = f"Filename is empty for document {doc_id}"
            logging.error(msg)
            raise ValueError(msg)
        # The header value may be quoted; drop the quote characters.
        filename = filename_match.group(1).replace('"', "")

        return {
            "wfi_id_original": doc_id,
            "wfi_document_id": doc_id,
            "filename": filename,
            "contentBytes": content,
            "contentType": content_type,
        }

    @staticmethod
    def extract_doc_ids(**kwargs) -> "List[str]":
        """Extract document ids from a CSV file or from WFI JSON output.

        When ``csv_path`` is given, the first ``n_csv_items`` values of the
        ``object_id`` column are returned and those rows are removed from the
        file, which is rewritten in place. Otherwise the ids are read from
        ``json_path`` and the file is left untouched.

        Args:
            csv_path (str): Path to a ``|``-separated CSV file with an
                ``object_id`` column. Takes precedence over ``json_path``.
            json_path (str): Path to a JSON file whose ``resultSet`` entries
                carry the id under ``propertyNameValuePair.Id``.
            n_csv_items (int): Number of rows to take from the CSV file.
                Missing or ``None`` takes every row; values are clamped to the
                range ``0..len(file)``.

        Returns:
            List[str]: List of document ids.

        Raises:
            Exception: Error reading or writing the CSV file, or reading the
                JSON file (logged before being re-raised).
        """
        if (csv_path := kwargs.get("csv_path")) is not None:
            try:
                df = pd.read_csv(csv_path, sep="|")
            except Exception:
                logging.exception("Failed to read CSV file")
                raise

            requested = kwargs.get("n_csv_items")
            if requested is None:
                n_csv_items = len(df)
            else:
                # Clamp so a negative or oversized request can neither return
                # rows that stay in the file nor run past the end of it.
                n_csv_items = max(0, min(len(df), int(requested)))
            doc_ids = df["object_id"].head(n_csv_items).tolist()

            # Remove the extracted rows from file
            df = df.iloc[n_csv_items:].reset_index(drop=True)

            try:
                df.to_csv(csv_path, sep="|", index=False)
            except Exception:
                logging.exception("Failed to write CSV file")
                raise
        else:
            try:
                json_file = Path(kwargs["json_path"])
                with json_file.open("r", encoding="utf-8") as f:
                    result_set = json.load(f)["resultSet"]
                # [1:-1] drops the first and last character of each stored id
                # (presumably the surrounding braces that fetch_document puts
                # back into the URL -- confirm against the WFI export format).
                doc_ids = [
                    item["propertyNameValuePair"]["Id"][1:-1] for item in result_set
                ]
            except Exception:
                logging.exception("Failed to read JSON file")
                raise

        return doc_ids


class WFIDataLoader(DataSource):  # pylint: disable=abstract-method
    """WFI data loader Plugin.

    Reads document ids from a CSV or JSON file, adds ids that failed on an
    earlier run (kept in Redis), skips documents already present in Squirro,
    and yields the fetched documents in batches.
    """

    redis_client: "Redis"
    wfi_loader: WFILoader
    _squirro_client: "SquirroClient" = None

    def connect(self, _=None, __=None) -> None:
        """Connect to the source.

        Creates the Redis client, the Squirro client (together with the
        project id) and a ``WFILoader`` built from the loaded configuration.

        Raises:
            Exception: The WFI loader could not be initialized (logged before
                being re-raised).
        """
        self.redis_client = init_redis_client()
        self._squirro_client, self.project_id = init_squirro_client()

        cfg = load_config()
        try:
            logging.info("Initializing wfi loader")
            self.wfi_loader = WFILoader(
                api_key=cfg.get("wfi", "api_key"),
                channel=cfg.get("wfi", "channel"),
                domain=cfg.get("wfi", "domain"),
                env=cfg.get("squirro", "environment"),
                object_store=cfg.get("wfi", "objectstore"),
                port=int(cfg.get("wfi", "port")),
                url=cfg.get("wfi", "url"),
            )
        except Exception:
            logging.exception("Failed to initialize wfi loader")
            raise

    def disconnect(self) -> None:
        """Disconnect from the source (nothing to release)."""

    # pylint: disable-next=invalid-name
    def getDataBatch(  # noqa: N802
        self, batch_size: int
    ) -> "Generator[List[Dict], Any, None]":
        """
        Generator - Get data from source on batches.

        Document ids come from the JSON file when ``json_path`` is set,
        otherwise from the CSV file, followed by ids from the Redis retry
        queue. Ids already indexed in Squirro are skipped; ids whose download
        fails are put back on the retry queue.

        :param batch_size: number of documents per yielded batch
        :returns a list of dictionaries
        """
        logging.info(
            "Getting data from %s source...", "json" if self.args.json_path else "csv"
        )
        if self.args.json_path:
            doc_ids = WFILoader.extract_doc_ids(json_path=self.args.json_path)
        else:
            doc_ids = WFILoader.extract_doc_ids(
                csv_path=self.args.csv_path, n_csv_items=self.args.n_csv_items
            )

        # Try adding failed items from redis as well
        doc_ids.extend(self._retry_failed_items())
        # An id can be both in the input file and on the retry queue; keep the
        # first occurrence only so the same document is not fetched twice.
        doc_ids = list(dict.fromkeys(doc_ids))
        logging.info("Total number of documents: %d", len(doc_ids))

        logging.info("Uploading data...")
        items: "List[Dict]" = []
        for doc_id in doc_ids:
            # Skip documents that Squirro already has under this WFI id.
            query = f"wfi_document_id:{doc_id}"
            result: "Dict" = self._squirro_client.query(  # type: ignore[type-arg]
                self.project_id, query=query, count=1
            )

            if item_dup := result.get("items", []):
                # NOTE(review): every duplicate overwrites this single key, so
                # only the most recent "<squirro item id>_<doc id>" pair is
                # kept -- confirm that this is intended.
                self.redis_client.set(
                    "wfi_duplicated_item_hash",
                    f"{item_dup[0]['id']}_{doc_id}",
                )
                continue

            logging.info("Fetching content for %s", doc_id)
            try:
                doc = self.wfi_loader.fetch_document(doc_id)
                items.append(doc)
                # Pause between successful fetches.
                time.sleep(1)
            except Exception:  # pylint: disable=broad-except
                # Store item in redis to retry later
                self.redis_client.zadd(
                    "failed_wfi_to_squirro", {doc_id: int(time.time())}
                )
                logging.exception("Fail to fetch content for %s", doc_id)
                continue

            if len(items) >= batch_size:
                yield items
                items = []

        # Flush the last, partially filled batch.
        if items:
            yield items

    def _retry_failed_items(self) -> "List[str]":
        """Take up to ``_N_RETRY_ITEMS`` failed document ids off the retry queue.

        The ids are removed from the ``failed_wfi_to_squirro`` sorted set; the
        caller adds them again if fetching fails once more.

        Returns:
            List[str]: Document ids to retry, lowest score first.
        """
        members = self.redis_client.zrange(
            "failed_wfi_to_squirro", 0, _N_RETRY_ITEMS - 1
        )
        doc_ids = []
        for member in members:
            # Members are bytes unless the Redis client decodes responses itself.
            doc_ids.append(member.decode() if isinstance(member, bytes) else member)
            self.redis_client.zrem("failed_wfi_to_squirro", member)
        return doc_ids

    # pylint: disable-next=invalid-name
    def getSchema(self) -> "List[str]":  # noqa: N802
        """Return the schema of the dataset.

        :returns a List containing the names of the columns retrieved from
        the source.
        """
        return [
            "contentBytes",
            "contentType",
            "wfi_document_id",
            "wfi_id_original",
            "filename",
        ]

    # pylint: disable-next=invalid-name
    def getJobId(self):  # noqa: N802
        """Return a unique string for each different select.

        :returns a string: SHA-256 hex digest of the configured source paths.
        """
        # Generate a stable id that changes with the main parameters. Only
        # arguments declared in getArguments() are used; getattr keeps this
        # working when one of them was not supplied.
        m = hashlib.sha256()
        for name in ("csv_path", "json_path"):
            value = getattr(self.args, name, None)
            m.update(f"{name}={value}".encode("utf-8"))
        job_id = m.hexdigest()
        logging.debug("Job ID: %s", job_id)
        return job_id

    # pylint: disable-next=invalid-name
    def getArguments(self) -> "List[Dict]":  # noqa: N802
        """Return the custom arguments accepted by this plugin.

        :returns a list of argument definitions (name, label, help, type)
        """
        return [
            {
                "name": "csv_path",
                "display_label": "CSV file path",
                "help": "File path to .csv file.",
                "type": "str",
            },
            {
                "name": "n_csv_items",
                "display_label": "Number of CSV items",
                "help": "Number of items to read from CSV file.",
                "type": "int",
            },
            {
                "name": "json_path",
                "display_label": "JSON file path",
                "help": "File path to .json file.",
                "type": "str",
            },
        ]
