seshat-tts

2026-05-22 05:54:01 -04:00
commit 75fc1afa53
48 changed files with 4192 additions and 0 deletions
@@ -0,0 +1,80 @@
+from pathlib import Path
+import json
+
+from seshat_tts.config import AppConfig, Rect, load_config, save_config
+
+
+def test_config_roundtrip(tmp_path: Path) -> None:
+    """A config written with ``save_config`` compares equal after ``load_config``."""
+    config_file = tmp_path / "config.json"
+    region = Rect(left=1, top=2, width=3, height=4)
+    original = AppConfig(
+        monitor_index=2,
+        hotkey="ctrl+shift+d",
+        capture_region_hotkey="ctrl+shift+r",
+        stop_hotkey="ctrl+shift+s",
+        dialogue_rect=region,
+        tesseract_cmd="C:/Tesseract/tesseract.exe",
+        voice_source="custom-wav",
+        default_voice="alba",
+        voice_path="voice.mp3",
+        language="english",
+        quantize_tts=True,
+        volume_gain=1.75,
+        last_text="hello",
+    )
+
+    save_config(original, config_file)
+    reloaded = load_config(config_file)
+
+    # Equality is whatever AppConfig defines; every field set above must survive the round trip.
+    assert reloaded == original
+
+
+def test_load_config_removes_old_region_metadata_from_last_text(tmp_path: Path) -> None:
+    """Region header lines stored at the start of ``last_text`` are dropped on load."""
+    # Stored value: two region header lines, a blank line, then the actual text.
+    stored_last_text = "Capture region: 85,51 628x84\nText region: 85,44 633x77\n\nA line to read."
+    config_file = tmp_path / "config.json"
+    payload = json.dumps({"last_text": stored_last_text})
+    config_file.write_text(payload, encoding="utf-8")
+
+    loaded = load_config(config_file)
+
+    assert loaded.last_text == "A line to read."
+
+
+def test_load_config_reads_llm_settings(tmp_path: Path) -> None:
+    """Every ``llm_*`` key present in the JSON file is exposed on the loaded config."""
+    llm_settings = {
+        "llm_enabled": True,
+        "llm_base_url": "http://127.0.0.1:11434/v1",
+        "llm_api_key": "local",
+        "llm_model": "unsloth-local",
+        "llm_timeout": 1.5,
+        "llm_max_tokens": 64,
+        "llm_disable_thinking": False,
+        "llm_image_extraction": True,
+        "llm_system_prompt": "clean this",
+    }
+    config_file = tmp_path / "config.json"
+    config_file.write_text(json.dumps(llm_settings), encoding="utf-8")
+
+    loaded = load_config(config_file)
+
+    # Boolean flags are checked by identity so a truthy non-bool value would fail.
+    assert loaded.llm_enabled is True
+    assert loaded.llm_base_url == "http://127.0.0.1:11434/v1"
+    assert loaded.llm_api_key == "local"
+    assert loaded.llm_model == "unsloth-local"
+    assert loaded.llm_timeout == 1.5
+    assert loaded.llm_max_tokens == 64
+    assert loaded.llm_disable_thinking is False
+    assert loaded.llm_image_extraction is True
+    assert loaded.llm_system_prompt == "clean this"
+
+
+def test_load_config_forces_english_language(tmp_path: Path) -> None:
+    """A stored ``language`` of ``"french"`` is loaded as ``"english"``."""
+    config_file = tmp_path / "config.json"
+    stored = json.dumps({"language": "french"})
+    config_file.write_text(stored, encoding="utf-8")
+
+    loaded = load_config(config_file)
+
+    assert loaded.language == "english"
@@ -0,0 +1,118 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+from PIL import Image
+
+from seshat_tts.llm import process_image_with_llm, process_text_with_llm
+
+
+@dataclass
+class _Message:
+    """Test double for one chat message; holds only the text the tests read back."""
+
+    # Text returned to the caller as the model's reply.
+    content: str
+
+
+@dataclass
+class _Choice:
+    """Test double for one entry of a response's ``choices`` list."""
+
+    # The fake message wrapped by this choice.
+    message: _Message
+
+
+@dataclass
+class _Response:
+    """Test double for the object returned by the fake ``create`` call."""
+
+    # Fake choices; the fake endpoint in this module always supplies exactly one.
+    choices: list[_Choice]
+
+
+class _Completions:
+    """Fake ``completions`` endpoint that remembers the arguments of the last ``create`` call."""
+
+    def __init__(self) -> None:
+        # Stays None until ``create`` has been invoked, so tests can detect "never called".
+        self.kwargs: dict[str, object] | None = None
+
+    def create(self, **kwargs: object) -> _Response:
+        """Record ``kwargs`` and return a canned response with one choice."""
+        self.kwargs = kwargs
+        canned_reply = _Message("Cleaned text.")
+        return _Response([_Choice(canned_reply)])
+
+
+class _Client:
+    """Fake client exposing ``chat.completions`` backed by a recording ``_Completions``."""
+
+    def __init__(self) -> None:
+        # Build a throwaway "Chat" class per client so each one gets its own recorder.
+        chat_type = type("Chat", (), {"completions": _Completions()})
+        self.chat = chat_type()
+
+
+def test_llm_disabled_returns_original_text() -> None:
+    """With ``enabled=False`` the input text comes back without its surrounding spaces."""
+    returned = process_text_with_llm(
+        " OCR text ",
+        enabled=False,
+        base_url="http://127.0.0.1:8000/v1",
+        api_key="local",
+        model="unsloth",
+        system_prompt="clean",
+    )
+
+    assert returned == "OCR text"
+
+
+def test_llm_enabled_uses_openai_compatible_chat_client() -> None:
+    """An enabled call goes through the injected client and returns its reply.
+
+    Also pins the request: the model name, a temperature of 0, and the
+    ``extra_body`` sent when ``disable_thinking`` is not passed.
+    """
+    fake_client = _Client()
+    expected_extra_body = {
+        "chat_template_kwargs": {"enable_thinking": False},
+        "enable_thinking": False,
+        "reasoning_effort": "none",
+    }
+
+    cleaned = process_text_with_llm(
+        "OCR text",
+        enabled=True,
+        base_url="http://127.0.0.1:8000/v1",
+        api_key="local",
+        model="unsloth-model",
+        system_prompt="clean",
+        timeout=1,
+        max_tokens=32,
+        client=fake_client,
+    )
+
+    assert cleaned == "Cleaned text."
+    request = fake_client.chat.completions.kwargs
+    assert request is not None
+    assert request["model"] == "unsloth-model"
+    assert request["temperature"] == 0
+    assert request["extra_body"] == expected_extra_body
+
+
+def test_llm_can_send_without_disable_thinking_metadata() -> None:
+    """Passing ``disable_thinking=False`` leaves ``extra_body`` out of the request."""
+    fake_client = _Client()
+
+    process_text_with_llm(
+        "OCR text",
+        enabled=True,
+        base_url="http://127.0.0.1:8000/v1",
+        api_key="local",
+        model="unsloth-model",
+        system_prompt="clean",
+        disable_thinking=False,
+        client=fake_client,
+    )
+
+    request = fake_client.chat.completions.kwargs
+    assert request is not None
+    assert "extra_body" not in request
+
+
+def test_llm_can_extract_text_from_image_region() -> None:
+    """Image extraction sends a text part plus a PNG data-URL part and returns the reply."""
+    fake_client = _Client()
+    region_image = Image.new("RGB", (16, 8), "black")
+
+    extracted = process_image_with_llm(
+        region_image,
+        base_url="http://127.0.0.1:8000/v1",
+        api_key="local",
+        model="vision-model",
+        timeout=1,
+        max_tokens=64,
+        client=fake_client,
+    )
+
+    assert extracted == "Cleaned text."
+    request = fake_client.chat.completions.kwargs
+    assert request is not None
+    assert request["model"] == "vision-model"
+    # The second message is the one whose content holds the text and image parts.
+    parts = request["messages"][1]["content"]
+    assert parts[0]["type"] == "text"
+    assert parts[1]["type"] == "image_url"
+    assert parts[1]["image_url"]["url"].startswith("data:image/png;base64,")
@@ -0,0 +1,22 @@
+from seshat_tts.ocr import extract_text_from_lines
+
+
+def test_selected_text_does_not_skip_first_line() -> None:
+    """Both OCR lines, including the first, appear in the result joined by one space."""
+    ocr_lines = ["A large group of humanoids came from the foothills", "and headed north not long ago."]
+    expected = "A large group of humanoids came from the foothills and headed north not long ago."
+
+    extracted = extract_text_from_lines(ocr_lines)
+
+    assert extracted == expected
+
+
+def test_selected_text_includes_choice_marker_text_when_inside_region() -> None:
+    """A line starting with the ``|.`` marker is kept verbatim in the extracted text."""
+    ocr_lines = ["Line to read.", "|. Continue"]
+
+    extracted = extract_text_from_lines(ocr_lines)
+
+    assert extracted == "Line to read. |. Continue"
+
+
+def test_selected_text_includes_pipe_marker_without_dot_when_inside_region() -> None:
+    """A line starting with a bare ``|`` marker is kept verbatim in the extracted text."""
+    ocr_lines = ["Line to read.", "| Continue"]
+
+    extracted = extract_text_from_lines(ocr_lines)
+
+    assert extracted == "Line to read. | Continue"
@@ -0,0 +1,34 @@
+from pathlib import Path
+import queue
+
+from seshat_tts import tts
+
+
+def test_prepared_audio_prompt_leaves_wav_unchanged(tmp_path: Path) -> None:
+    """A ``.wav`` prompt path is returned as-is."""
+    wav_file = tmp_path / "voice.wav"
+    wav_file.write_bytes(b"wav")
+
+    prepared = tts._prepared_audio_prompt_path(wav_file, "english", queue.Queue())
+
+    assert prepared == wav_file
+
+
+def test_prepared_audio_prompt_converts_mp3_once(tmp_path: Path, monkeypatch) -> None:
+    """An ``.mp3`` prompt is converted to ``.wav`` on first use and reused on the second."""
+    mp3_file = tmp_path / "voice.mp3"
+    mp3_file.write_bytes(b"mp3")
+    cache_dir = tmp_path / "cache"
+    conversions: list[tuple[Path, Path]] = []
+
+    def record_and_write(input_path: Path, output_path: Path) -> None:
+        # Stand-in converter: log the call and create the output file.
+        conversions.append((input_path, output_path))
+        output_path.write_bytes(b"wav")
+
+    monkeypatch.setattr(tts, "VOICE_CACHE_DIR", cache_dir)
+    monkeypatch.setattr(tts, "_convert_mp3_to_wav", record_and_write)
+
+    first_result = tts._prepared_audio_prompt_path(mp3_file, "english", queue.Queue())
+    second_result = tts._prepared_audio_prompt_path(mp3_file, "english", queue.Queue())
+
+    assert first_result == second_result
+    assert first_result.suffix == ".wav"
+    assert first_result.exists()
+    # Exactly one conversion call, targeting the path that was returned.
+    assert conversions == [(mp3_file, first_result)]