from unittest.mock import MagicMock, patch

import pytest
from force_ocr import ForceOCRPipelet

from squirro.lib.storage.handler import StorageHandler


class TestForceOCR:
    """Tests for ``ForceOCRPipelet.consume`` with URL parsing stubbed out."""

    def setup_method(self):
        """Stub ``StorageHandler._parse_url`` and build a fresh pipelet.

        The stub is installed through a patcher instead of being assigned
        directly onto the class, so ``teardown_method`` can restore the real
        implementation and the mock does not leak into unrelated tests.
        """
        self._parse_url_patcher = patch.object(
            StorageHandler,
            "_parse_url",
            new=MagicMock(return_value=("bucket", "key", "path")),
        )
        self._parse_url_patcher.start()
        try:
            self.pipelet = ForceOCRPipelet({})
        except Exception:
            # Teardown is not guaranteed to run when setup fails, so undo the
            # patch here before propagating the error.
            self._parse_url_patcher.stop()
            raise

    def teardown_method(self):
        """Restore the original ``StorageHandler._parse_url``."""
        self._parse_url_patcher.stop()

    def test_no_files(self, item_no_files):
        """An item without files comes back equal to the input."""
        output = self.pipelet.consume(item_no_files.copy())
        assert item_no_files == output

    def test_parse_url_failed(self, item, caplog):
        """A failure in ``_parse_url`` propagates and is logged."""
        with patch.object(StorageHandler, "_parse_url", side_effect=Exception()):
            with pytest.raises(Exception):
                self.pipelet.consume(item.copy())

        assert "Failed to parse content URL." in caplog.text

    @patch("force_ocr.subprocess")
    def test_get_pdf_content_failed(self, mock_subprocess, item, caplog):
        """A failure in ``subprocess.run`` propagates and is logged."""
        mock_subprocess.run = MagicMock(side_effect=Exception())

        with pytest.raises(Exception):
            self.pipelet.consume(item.copy())

        assert "Failed to extract text from PDF." in caplog.text


@pytest.fixture
def item():
    """Provide an item that carries a single PDF file entry."""
    pdf_file = dict(
        content_url="https://example.com/document.pdf",
        mime_type="application/pdf",
    )
    return dict(id="123", files=[pdf_file], keywords={})


@pytest.fixture
def item_no_files():
    """Provide an item whose ``files`` list is empty."""
    return dict(id="123", files=[], keywords={})


@pytest.fixture(
    params=(
        ("empty-pdf.pdf", True),
        ("first-4-empty.pdf", False),
        ("first-5-empty.pdf", True),
        ("force-ocr.pdf", True),
        ("many-words.pdf", False),
        ("non-ocr.pdf", True),
    )
)
def test_cases(request):
    """Return the current ``(pdf_filename, flag)`` parameter pair.

    NOTE(review): the boolean is presumably the expected force-OCR outcome
    for the named PDF -- confirm against the test that consumes this fixture.
    """
    current_case = request.param
    return current_case
