seshat-tts

2026-05-22 05:54:01 -04:00
commit cdb4563633
48 changed files with 4192 additions and 0 deletions
@@ -0,0 +1,13 @@
+root = true
+
+[*]
+charset = utf-8
+end_of_line = crlf
+insert_final_newline = true
+trim_trailing_whitespace = true
+
+[*.{md,yml,yaml,toml,py}]
+end_of_line = lf
+
+[*.ps1]
+end_of_line = crlf
@@ -0,0 +1,64 @@
+name: Bug report
+description: Report a reproducible Seshat TTS problem
+title: "[Bug]: "
+labels: ["bug", "needs-triage"]
+body:
+  - type: textarea
+    id: summary
+    attributes:
+      label: Summary
+      description: What happened?
+    validations:
+      required: true
+  - type: textarea
+    id: steps
+    attributes:
+      label: Steps to reproduce
+      description: List the exact steps.
+      placeholder: |
+        1. Open app
+        2. Select window
+        3. Press hotkey
+    validations:
+      required: true
+  - type: textarea
+    id: expected
+    attributes:
+      label: Expected behavior
+    validations:
+      required: true
+  - type: textarea
+    id: actual
+    attributes:
+      label: Actual behavior
+    validations:
+      required: true
+  - type: input
+    id: version
+    attributes:
+      label: App version or commit
+  - type: input
+    id: windows
+    attributes:
+      label: Windows version
+  - type: dropdown
+    id: capture-mode
+    attributes:
+      label: Capture mode
+      options:
+        - monitor
+        - window
+        - unknown
+  - type: dropdown
+    id: voice-mode
+    attributes:
+      label: Voice mode
+      options:
+        - default
+        - custom-wav
+        - unknown
+  - type: textarea
+    id: logs
+    attributes:
+      label: Error text or logs
+      description: Remove secrets before pasting.
@@ -0,0 +1,8 @@
+blank_issues_enabled: false
+contact_links:
+  - name: Support
+    url: mailto:support@scriptriva.com
+    about: Contact Scriptriva Inc. for support inquiries.
+  - name: Security vulnerability
+    url: mailto:support@scriptriva.com
+    about: Please report security issues privately.
@@ -0,0 +1,28 @@
+name: Feature request
+description: Suggest a focused improvement
+title: "[Feature]: "
+labels: ["enhancement", "needs-triage"]
+body:
+  - type: textarea
+    id: problem
+    attributes:
+      label: Problem
+      description: What user problem does this solve?
+    validations:
+      required: true
+  - type: textarea
+    id: proposal
+    attributes:
+      label: Proposed solution
+      description: What should change?
+    validations:
+      required: true
+  - type: textarea
+    id: alternatives
+    attributes:
+      label: Alternatives considered
+  - type: textarea
+    id: scope
+    attributes:
+      label: Scope and risks
+      description: What files or behavior might be affected?
@@ -0,0 +1,16 @@
+## Summary
+
+Describe what changed and why.
+
+## Testing
+
+- [ ] `python -m pytest -q`
+- [ ] GUI smoke tested, if UI behavior changed
+- [ ] Build tested, if packaging changed
+
+## Checklist
+
+- [ ] No secrets, voice samples, local configs, or generated binaries committed
+- [ ] Documentation updated if behavior changed
+- [ ] Third-party notices preserved
+- [ ] Change is scoped and maintainable
@@ -0,0 +1,11 @@
+version: 2
+updates:
+  - package-ecosystem: "pip"
+    directory: "/"
+    schedule:
+      interval: "weekly"
+    open-pull-requests-limit: 5
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "weekly"
@@ -0,0 +1,36 @@
+name: CI
+
+on:
+  pull_request:
+  push:
+    branches: ["main"]
+
+permissions:
+  contents: read
+
+jobs:
+  tests:
+    name: Tests
+    runs-on: windows-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.10", "3.13"]
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: pip
+      - name: Install
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install -e ".[test]"
+      - name: Compile
+        run: python -m compileall src tests scripts\pyinstaller_entry.py
+      - name: Test
+        env:
+          PYTHONPATH: src
+        run: python -m pytest -q
@@ -0,0 +1,13 @@
+build/*
+build/launcher/*
+dist/*
+scripts/build_output.txt
+*.egg-info/
+__pycache__/*
+.pytest_cache/*
+.venv/
+*.pyc
+tts_output.wav
+*.log
+src/seshat_tts/__pycache__/*
+api_key.txt
@@ -0,0 +1,23 @@
+# Code of Conduct
+
+Seshat TTS follows the [Contributor Covenant Code of Conduct](https://www.contributor-covenant.org/version/2/1/code_of_conduct/).
+
+## Expected Behavior
+
+- Be respectful, direct, and constructive.
+- Assume good intent while still holding technical claims to evidence.
+- Keep discussion focused on the project and its users.
+- Report security or abuse issues through the security process instead of public threads.
+
+## Unacceptable Behavior
+
+- Harassment, threats, discriminatory language, or targeted personal attacks.
+- Publishing private information or credentials.
+- Using the project to support abuse, fraud, malware, unauthorized surveillance, or unauthorized voice cloning.
+- Repeatedly derailing technical discussion after maintainers ask to refocus.
+
+## Enforcement
+
+Scriptriva Inc. maintainers may moderate, edit, hide, lock, or remove issues, comments, pull requests, and releases that violate this code. Serious or repeated violations may result in a ban from project spaces.
+
+For conduct concerns, contact support@scriptriva.com.
@@ -0,0 +1,39 @@
+# Contributing
+
+Thank you for helping improve Seshat TTS. This project is maintained by Scriptriva Inc. and welcomes focused community contributions.
+
+## Development Setup
+
+```powershell
+python -m venv .venv
+.\.venv\Scripts\Activate.ps1
+python -m pip install -e ".[test]"
+$env:PYTHONPATH='src'
+python -m pytest -q
+```
+
+## Contribution Areas
+
+- OCR accuracy and preprocessing.
+- Window capture reliability.
+- TTS stream cancellation and playback.
+- Local OpenAI-compatible LLM cleanup.
+- Packaging and documentation.
+- Accessibility and usability.
+
+## Pull Request Expectations
+
+- Keep changes scoped and explain user-visible behavior.
+- Add or update tests for behavior changes.
+- Do not commit secrets, voice samples, generated voice caches, build outputs, or local config files.
+- Preserve third-party notices and license files.
+- Follow existing code style and avoid unrelated refactors.
+- Run `python -m pytest -q` before opening a pull request.
+
+## Licensing
+
+By contributing, you agree that your contribution may be used under the project license and under any future license chosen by Scriptriva Inc. for this project. You also confirm that you have the right to submit the contribution.
+
+## Security
+
+Do not open public issues for vulnerabilities. Follow [SECURITY.md](SECURITY.md).
@@ -0,0 +1,30 @@
+# Governance
+
+Seshat TTS is maintained by Scriptriva Inc.
+
+## Maintainer Authority
+
+Scriptriva Inc. maintainers decide:
+
+- project roadmap and priorities
+- releases and distribution
+- licensing and branding
+- security response
+- pull request acceptance
+- issue moderation
+
+## Community Contributions
+
+Community contributions are welcome when they align with the project goals and license. Maintainers may request changes, defer work, close inactive issues, or reject changes that increase maintenance risk.
+
+## Decision Principles
+
+- User safety and privacy come first.
+- Runtime reliability matters more than novelty.
+- Captured text, voice files, API keys, and generated caches are sensitive user data.
+- Third-party licenses and notices must remain clear.
+- UI changes should keep the app understandable for non-developer users.
+
+## Maintainer Contact
+
+For support, licensing, governance, or security routing: support@scriptriva.com
@@ -0,0 +1,286 @@
+Scriptriva Public Source License 1.0
+
+Copyright (c) 2026 Scriptriva Inc.
+
+This license may be reused for other projects by replacing the copyright owner,
+year, project metadata, support contact, and any project-specific notices. The
+terms below use universal language so the license can govern any software,
+documentation, assets, build scripts, or related materials distributed under it.
+
+This is a public source license. It allows commercial use subject to the terms
+below. It is not the MIT License, and it is not an Open Source Initiative
+approved open source license because it contains attribution, trademark,
+distribution, safety, and same-functionality product restrictions.
+
+1. Definitions
+
+"Licensor" means the person or entity identified in the copyright notice for the
+Licensed Work.
+
+"Licensed Work" means the source code, object code, documentation, graphical
+interface, build scripts, first-party assets, examples, configuration, and other
+materials distributed by Licensor under this license, except for third-party
+materials governed by separate licenses or notices.
+
+"You" means any individual, company, organization, or other legal entity that
+accesses, uses, copies, modifies, distributes, links to, embeds, hosts, or
+otherwise interacts with the Licensed Work.
+
+"Source Form" means the preferred form of the Licensed Work for reading,
+maintaining, or making modifications.
+
+"Object Form" means any compiled, packaged, bundled, generated, minified,
+containerized, hosted, or machine-executable form of the Licensed Work.
+
+"Modification" means any change, translation, adaptation, refactor, port,
+extension, integration, plugin, compilation, packaging, or derivative work based
+on the Licensed Work.
+
+"Public Project" means any repository, package, release, website, distribution,
+service, application, marketplace listing, publication, or other disclosure made
+available outside your own household or internal organization.
+
+"Commercial Use" means any use intended for, connected to, or resulting in
+direct or indirect monetary value, including selling, licensing, hosting,
+offering paid support, bundling with a paid product, including in a commercial
+service, using in revenue-generating operations, using to provide professional
+services, using in a product trial or freemium funnel, or using to gain a
+commercial advantage.
+
+"Same-Functionality Product" means any software, service, product, script,
+workflow, plugin, system, model, dataset, or hosted offering that reproduces,
+substitutes for, markets against, or provides substantially similar core
+functionality to the Licensed Work. The analysis depends on the purpose and
+functionality of the original project, not only on implementation details.
+
+"First-Party Marks and Assets" means names, trademarks, service marks, logos,
+icons, mascots, banners, artwork, screenshots, trade dress, product identities,
+and branding owned or supplied by Licensor.
+
+2. Grant of Rights
+
+Subject to all terms of this license, Licensor grants you a worldwide,
+royalty-free, non-exclusive, non-transferable, non-sublicensable, revocable
+license to:
+
+a. view, read, inspect, and study the Licensed Work;
+
+b. run the Licensed Work for personal, internal, academic, research, evaluation,
+accessibility, interoperability, and Commercial Use;
+
+c. copy the Licensed Work for backup, archival, testing, internal deployment,
+permitted Commercial Use, and permitted distribution;
+
+d. create Modifications for personal, internal, academic, research, evaluation,
+accessibility, interoperability, and Commercial Use;
+
+e. publish patches, bug reports, interoperability notes, benchmarks, security
+findings, documentation, and compatibility layers related to the Licensed Work;
+
+f. distribute the unmodified Licensed Work or Modifications when the distribution
+complies with this license, including the attribution, notice, branding,
+third-party, and Same-Functionality Product restrictions; and
+
+g. link to, call, embed, automate, or interoperate with the Licensed Work when
+your use complies with this license.
+
+No rights are granted except those expressly stated in this license.
+
+3. Attribution and Notice Requirements
+
+If you copy, modify, distribute, link to, embed, reference, host, or publish any
+portion of the Licensed Work in Source Form or Object Form, you must:
+
+a. preserve all copyright, license, attribution, provenance, and third-party
+notice files;
+
+b. include a clear notice identifying the Licensed Work, the Licensor, the
+copyright notice, and the Scriptriva Public Source License 1.0;
+
+c. identify material Modifications you made;
+
+d. provide a copy of this license with the relevant copy, package, repository,
+release, service, documentation, or distribution;
+
+e. avoid implying that Licensor sponsors, endorses, maintains, certifies, or is
+affiliated with your project unless Licensor gives written permission; and
+
+f. preserve third-party notices and comply with third-party licenses for
+components not owned by Licensor.
+
+For graphical applications, documentation sites, public demos, app stores,
+release pages, hosted services, and packaged distributions, attribution must be
+reasonably visible in at least one of: an About window, license page, notices
+file, README, package metadata, documentation page, legal page, or equivalent
+user-accessible location.
+
+4. Commercial Use
+
+Commercial Use is allowed under this license when it complies with all terms of
+this license. You do not need a separate commercial license merely to use the
+Licensed Work commercially, sell services that use the Licensed Work internally,
+or include the Licensed Work in a larger commercial system, provided that your
+use does not violate Sections 5, 6, 7, or any third-party license.
+
+5. Same-Functionality Product Restrictions
+
+You may not, without prior written permission from Licensor:
+
+a. sell, rent, lease, sublicense, monetize, host, or otherwise charge for access
+to the Licensed Work or a Modification as a standalone Same-Functionality
+Product;
+
+b. use the Licensed Work or a Modification to create, operate, improve, train,
+benchmark, market, or distribute a Same-Functionality Product;
+
+c. rebrand the Licensed Work or a Modification as a competing product;
+
+d. offer the Licensed Work or a Modification as a hosted service, cloud service,
+managed service, paid API, subscription product, marketplace product, paid
+plugin, paid desktop application, or paid utility whose primary purpose is
+substantially similar to the Licensed Work; or
+
+e. use the Licensed Work to train, fine-tune, evaluate, benchmark, or improve a
+model, dataset, agent, or automation intended to compete with or replicate the
+core functionality of the Licensed Work.
+
+This section does not prohibit ordinary Commercial Use permitted by Section 4,
+interoperability, independent implementations that do not use the Licensed Work,
+or public criticism, benchmarking, security review, or compatibility testing.
+
+6. Public Projects, Linking, and Modifications
+
+You may reference, link to, embed, or interoperate with the Licensed Work in a
+Public Project, including a commercial Public Project, only if:
+
+a. the Public Project does not violate the Same-Functionality Product
+restrictions in Section 5;
+
+b. the Public Project includes the attribution required by Section 3;
+
+c. your use does not make First-Party Marks and Assets appear to be your own;
+
+d. your repository, package, documentation, or notice file clearly identifies
+which portions are yours and which portions are derived from, include, or
+interface with the Licensed Work; and
+
+e. any Modification of the Licensed Work remains under this license unless
+Licensor gives written permission for another license.
+
+7. Restrictions
+
+You may not:
+
+a. remove, obscure, or alter copyright, license, trademark, attribution,
+provenance, or third-party notice files;
+
+b. use First-Party Marks and Assets in a way that suggests endorsement,
+affiliation, certification, or ownership by you;
+
+c. redistribute First-Party Marks and Assets separately from the Licensed Work
+unless Licensor gives written permission;
+
+d. use the Licensed Work to violate law, privacy rights, publicity rights, voice
+rights, copyright, platform terms, or anti-circumvention rules;
+
+e. use the Licensed Work to clone, synthesize, imitate, capture, extract, or
+process a person's voice, likeness, private data, credentials, or protected
+content without appropriate rights or permission;
+
+f. use the Licensed Work in surveillance, credential theft, fraud, malware,
+spyware, covert data extraction, or deceptive automation workflows;
+
+g. bypass, disable, remove, or misrepresent license notices, support notices,
+security notices, safety checks, provenance indicators, or attribution flows;
+
+h. grant sublicenses or downstream rights broader than the rights granted to you
+under this license; or
+
+i. use the Licensed Work in a manner that creates legal obligations for Licensor
+or requires Licensor to license First-Party Marks and Assets, patents, trade
+secrets, private data, or third-party materials.
+
+8. Contributions
+
+Unless Licensor signs a separate written agreement with you, any patch, pull
+request, issue suggestion, documentation change, bug fix, code sample, asset,
+or other contribution you submit to Licensor may be used, modified, relicensed,
+sublicensed, distributed, and commercialized by Licensor without restriction or
+payment to you. You represent that you have the right to submit the contribution.
+
+9. Trademarks and Branding
+
+This license does not grant trademark rights. Licensor retains all rights in
+First-Party Marks and Assets. You may use the project name only as necessary for
+accurate attribution, compatibility statements, or references to the unmodified
+Licensed Work.
+
+10. Third-Party Components
+
+Third-party components, dependencies, binaries, models, voices, data files,
+fonts, media, and tools are governed by their own licenses and terms. This
+license does not change, replace, expand, or restrict rights you may have under
+those third-party licenses. If a third-party license gives you broader rights
+for that third-party component, those broader rights apply only to that
+third-party component and not to the Licensed Work owned by Licensor.
+
+11. Additional Permissions
+
+Separate permissions, commercial redistribution permissions,
+Same-Functionality Product permissions, branding permissions, trademark
+permissions, and other exceptions may be available from Licensor. For this
+project, support and licensing inquiries may be sent to support@scriptriva.com.
+
+12. Termination
+
+Your rights under this license terminate automatically if you violate any term
+of this license. Upon termination, you must stop using, copying, modifying,
+distributing, hosting, linking to, or otherwise exploiting the Licensed Work and
+destroy or delete all copies under your control, except copies you are legally
+required to retain for archival compliance. Sections intended to survive,
+including attribution, restrictions, third-party notices, trademarks,
+disclaimers, limitation of liability, and governing terms, survive termination.
+
+Licensor may reinstate your rights in writing or by accepting a cure, but
+reinstatement is not automatic unless required by applicable law.
+
+13. No Warranty
+
+THE LICENSED WORK IS PROVIDED "AS IS" AND "AS AVAILABLE", WITHOUT WARRANTY OF
+ANY KIND, EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, INCLUDING WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE, NON-INFRINGEMENT,
+SECURITY, ACCURACY, AVAILABILITY, OR ERROR-FREE OPERATION.
+
+14. Limitation of Liability
+
+TO THE MAXIMUM EXTENT PERMITTED BY LAW, LICENSOR AND ITS OFFICERS, EMPLOYEES,
+CONTRACTORS, CONTRIBUTORS, AND LICENSORS WILL NOT BE LIABLE FOR ANY INDIRECT,
+INCIDENTAL, SPECIAL, CONSEQUENTIAL, EXEMPLARY, PUNITIVE, OR BUSINESS-
+INTERRUPTION DAMAGES; LOST PROFITS; LOST REVENUE; LOST DATA; LOSS OF GOODWILL;
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; OR CLAIMS ARISING FROM USE,
+INABILITY TO USE, MODIFICATION, DISTRIBUTION, HOSTING, OR RELIANCE ON THE
+LICENSED WORK, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
+
+15. Compliance With Law
+
+You are responsible for complying with applicable laws, regulations, platform
+terms, export controls, privacy rules, accessibility rules, and intellectual
+property rights when using the Licensed Work.
+
+16. License Versions
+
+This license is version 1.0. Scriptriva Inc. may publish new versions of this
+license. A file explicitly licensed under this version remains under this
+version unless Licensor states otherwise in writing.
+
+17. Severability
+
+If any provision of this license is held unenforceable, the remaining provisions
+remain in effect. The unenforceable provision will be interpreted as closely as
+possible to its original purpose while remaining enforceable.
+
+18. Entire License
+
+This license is the complete agreement for the rights granted here. Any waiver,
+exception, special permission, or alternate license must be in writing from
+Licensor.
@@ -0,0 +1,274 @@
+# Seshat TTS
+
+![Scriptriva Inc.](resources/banner.jpg)
+
+<p align="center">
+  <img src="resources/logo.png" alt="Scriptriva Inc. logo" width="160">
+</p>
+
+[![Python 3.10-3.14](https://img.shields.io/badge/python-3.10--3.14-3776ab?logo=python&logoColor=white)](https://www.python.org/)
+[![Windows](https://img.shields.io/badge/platform-Windows-0078d4?logo=windows&logoColor=white)](https://www.microsoft.com/windows)
+[![License: SPSL 1.0](https://img.shields.io/badge/license-SPSL%201.0-green.svg)](LICENSE)
+[![Pocket TTS](https://img.shields.io/badge/voice-Kyutai%20Pocket%20TTS-111827)](https://github.com/kyutai-labs/pocket-tts)
+[![Tesseract OCR](https://img.shields.io/badge/OCR-Tesseract-4b5563)](https://github.com/tesseract-ocr/tesseract)
+
+Seshat TTS is a Windows GUI utility that reads on-screen text aloud in realtime for games and apps. Pick a monitor or window, drag one capture region over the text, press one hotkey, and the selected text is extracted with Tesseract OCR or a local vision LLM, then streamed through Kyutai Pocket TTS.
+
+Maintained by Scriptriva Inc.
+
+For support inquiries email: support@scriptriva.com
+<p align="center">
+   <img src="resources/anime_meme.gif" alt="Anime girl studying">
+</p>
+
+
+## What It Does
+
+- Captures one selected screen region from a monitor or a chosen window.
+- Runs Tesseract OCR on that exact region, or sends the region image directly to a local vision-capable LLM for text extraction.
+- Streams the extracted text through Pocket TTS in realtime.
+- Lets you use a built-in Pocket TTS voice for speed or upload a custom WAV/MP3 reference voice.
+- Optionally routes OCR text through a local OpenAI-compatible LLM endpoint before speech.
+- Includes a 0-300% playback volume slider for quiet voices or noisy games.
+- Stops any active audio stream when a new read starts, so repeated hotkey presses do not overlap.
+- Caches custom voice state as `.safetensors` for faster repeat custom-voice reads when using the `uvx-server` backend.
+
+<p align="center">
+   <img src="resources/anime_yapping.gif" alt="Fast yapping mode">
+</p>
+
+## Requirements
+
+- Windows 10/11.
+- Python 3.10 through 3.14 when running from source or building.
+- Tesseract OCR for Windows when running from source or building a portable EXE with bundled OCR.
+- `uvx` when running from source, or when building a portable EXE with bundled uvx.
+- A working audio output device.
+
+Install Tesseract:
+
+```powershell
+winget install UB-Mannheim.TesseractOCR
+```
+
+Install `uvx`:
+
+```powershell
+powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex"
+```
+
+Install Seshat TTS for development or for the fast launcher:
+
+```powershell
+python -m venv .venv
+.\.venv\Scripts\Activate.ps1
+python -m pip install -e ".[test]"
+```
+
+## Build Before Use
+
+For a single-file portable EXE, build with:
+
+```powershell
+.\scripts\build_exe.ps1
+```
+
+Portable output:
+
+```powershell
+.\dist\seshat-tts.exe
+```
+
+That EXE bundles the Seshat GUI/runtime files, app resources, `uvx.exe` if it is available on the build machine, and Tesseract OCR files if Tesseract is installed at `C:\Program Files\Tesseract-OCR`. You can override the OCR bundle source before building:
+
+```powershell
+$env:SESHAT_TESSERACT_DIR='D:\Tools\Tesseract-OCR'
+.\scripts\build_exe.ps1
+```
+
+For the old one-folder PyInstaller build:
+
+```powershell
+.\scripts\build_exe.ps1 -OneDir
+```
+
+One-folder output:
+
+```text
+dist\seshat-tts\seshat-tts.exe
+```
+
+The portable EXE still uses Pocket TTS through `uvx-server`. It does not freeze Torch/Pocket TTS inside the EXE because that path has been unreliable on Windows and can trigger native DLL initialization failures. First Pocket TTS use can still download/cache the Pocket TTS tool and model data under the user's normal cache directories, but no separate Python, Tesseract, or uvx install should be needed when those files were bundled during build.
+
+For a tiny development launcher, build:
+
+```powershell
+.\scripts\build_launcher_exe.ps1
+```
+
+Launcher output:
+
+```text
+dist\launcher\seshat-tts.exe
+```
+
+This launcher is intentionally small and quick to build. It uses the `.venv` in this project when present, so keep the virtual environment and installed dependencies beside the launcher.
+
+## Run From Source
+
+```powershell
+seshat-tts
+```
+
+For the fast launcher EXE, run:
+
+```powershell
+.\dist\launcher\seshat-tts.exe
+```
+
+The launcher expects dependencies in `.venv` or your active Python environment. It does not bundle Python, Torch, Pocket TTS, or Tesseract.
+
+## First-Time Setup
+
+1. Open Seshat TTS.
+2. Choose `monitor` or `window` capture mode.
+3. Select the monitor or window to watch.
+4. Click `Select Region`, then drag over the exact text area to read.
+5. Click inside `Read Hotkey` and press the key combo you want. The default is `ctrl+alt+n`.
+6. Click inside `Region Hotkey` and press the key combo you want. The default is `ctrl+alt+r`.
+7. Click inside `Stop Hotkey` and press the key combo you want. The default is `ctrl+alt+s`.
+8. Set `Tesseract` if it was not detected automatically.
+9. Choose a voice:
+   - `default` is fastest and uses a built-in Pocket TTS voice.
+   - `custom-wav` lets you choose a named WAV, MP3, or cached `.safetensors` reference voice.
+10. Adjust `Volume` if the generated voice is too quiet. `100%` is neutral; values above that boost the volume and are clipped safely.
+11. Enable `Local LLM` if you want OCR text cleaned by a local OpenAI-compatible server before TTS.
+12. Enable `Use local LLM vision instead of Tesseract OCR` only when your local model endpoint supports image input and you want the LLM to read the selected region directly.
+13. Click `Preload TTS` once before playing if you want the first read to be less delayed.
+14. Press the read hotkey whenever the selected text should be spoken, or the stop hotkey whenever playback should stop.
+
+Use borderless/windowed mode for games if exclusive fullscreen capture returns stale or blank frames.
+
+## Local LLM
+
+The `Local LLM` panel can use an OpenAI-compatible endpoint in two ways:
+
+- `Route OCR through local OpenAI-compatible LLM` keeps Tesseract as the text extractor, then asks the local model to clean the parsed text before TTS.
+- `Use local LLM vision instead of Tesseract OCR` skips Tesseract and sends the selected region image to the local model as a PNG data URL. This requires a vision-capable OpenAI-compatible model endpoint.
+
+Typical values:
+
+```text
+Base URL: http://127.0.0.1:8000/v1
+API Key: local key or token
+Model: the model name exposed by your local server
+```
+
+`Load api_key.txt` fills the API key field from a repo-local `api_key.txt` file if present. Treat that file as a secret and do not commit it. Lower timeout and max token values reduce latency. No network or LLM path can be truly zero-latency, but a local endpoint keeps the delay as short as the model server allows.
+
+`Disable thinking` is enabled by default. It sends common OpenAI-compatible metadata for local reasoning models, including `chat_template_kwargs.enable_thinking=false`, so models that support that switch skip reasoning output and return faster.
+
+## Voice Modes
+
+`default` voice mode is the fastest. Pick a built-in voice such as `alba`, `marius`, `anna`, `vera`, or `george`.
+
+`custom-wav` mode accepts `.wav`, `.mp3`, and cached `.safetensors` voice files. MP3 references are converted once into cached WAV files before Pocket TTS processes them. Use `Manage` beside `Custom Voice` to name voices, save them, and select them from the dropdown.
+
+The first custom-voice run can be slow because Pocket TTS must convert the reference audio into a voice state. Seshat TTS caches that state under:
+
+```text
+%USERPROFILE%\.seshat-tts\voices
+```
+
+After that cache exists, the `uvx-server` backend sends a reusable local `voice_url` instead of uploading and reprocessing the same audio every time. Named custom voices are stored in:
+
+```text
+%USERPROFILE%\.seshat-tts\voice_profiles.json
+```
+
+Pocket TTS voice cloning may require Hugging Face access:
+
+1. Request access on [Kyutai's Pocket TTS Hugging Face page](https://huggingface.co/kyutai/pocket-tts).
+2. Create a token at [Hugging Face tokens](https://huggingface.co/settings/tokens).
+3. Log in for `uvx`:
+
+```powershell
+uvx hf auth login --force
+```
+
+## Build Commands
+
+Fast launcher build, usually under a minute:
+
+```powershell
+.\scripts\build_launcher_exe.ps1
+```
+
+Output:
+
+```text
+dist\launcher\seshat-tts.exe
+```
+
+Full dependency-bundled PyInstaller build:
+
+```powershell
+.\scripts\build_exe.ps1
+```
+
+Output:
+
+```text
+dist\seshat-tts.exe
+```
+
+Use the fast launcher during development and for local use. Use the portable build when you need to move the app to a machine where Python, Tesseract, and uvx are not installed.
+
+The `python-api` backend is only shown when running from source or the fast launcher. The bundled PyInstaller EXE only exposes `uvx-server`.
+
+## License and Reuse
+
+Seshat TTS is released under the [Scriptriva Public Source License 1.0](LICENSE).
+
+Copyright (c) 2026 Scriptriva Inc.
+
+Commercial use is allowed under the license terms. The license preserves attribution, third-party notices, Scriptriva branding rights, safety restrictions, and restrictions on reusing the licensed work to create or distribute a same-functionality product.
+
+Useful reuse boundaries:
+
+- `src/seshat_tts/capture.py`: monitor/window capture helpers.
+- `src/seshat_tts/ocr.py`: OCR preprocessing and text extraction.
+- `src/seshat_tts/tts.py`: Pocket TTS server/API playback adapters and stream cancellation.
+- `src/seshat_tts/llm.py`: OpenAI-compatible local LLM cleanup step.
+- `src/seshat_tts/config.py`: persisted GUI/runtime configuration.
+- `src/seshat_tts/region_picker.py`: snipping-tool-style region selection.
+
+Security and privacy considerations for reuse:
+
+- Treat OCR text, API keys, custom voice files, and generated voice caches as user data.
+- Do not commit `api_key.txt`, voice samples, `.safetensors` voice caches, or local config files.
+- Custom voice cloning should be used only with audio you have permission to use.
+- The portable EXE may bundle third-party binaries; keep their notices and license terms intact.
+
+
+
+## Third-Party Notices
+
+Seshat TTS uses and/or interfaces with these third-party projects. Each project remains under its own license:
+
+| Component | Purpose | License | Notes |
+| --- | --- | --- | --- |
+| [Kyutai Pocket TTS](https://github.com/kyutai-labs/pocket-tts) | Local text-to-speech generation and voice cloning | MIT | The Pocket TTS GitHub repository identifies the project as MIT licensed. Model/voice assets may have separate terms; review the linked Hugging Face pages before redistribution. |
+| [Tesseract OCR](https://tesseractocr.org/) | OCR engine used to extract text from selected screen regions | Apache License 2.0 | Tesseract is not MIT licensed. Its project site identifies it as Apache 2.0 licensed. |
+| [pytesseract](https://github.com/madmaze/pytesseract) | Python wrapper for Tesseract | Apache License 2.0 | Used to invoke the Tesseract executable from Python. |
+| [PyInstaller](https://pyinstaller.org/) | Windows executable packaging | GPLv2-or-later with bootloader exception | Used only for building packaged executables. |
+| [OpenAI Python SDK](https://github.com/openai/openai-python) | OpenAI-compatible local LLM client | Apache License 2.0 | Used for optional local LLM cleanup through OpenAI-compatible endpoints. |
+
+Packaged builds include [THIRD_PARTY_NOTICES.md](THIRD_PARTY_NOTICES.md), including a link to the [Pocket TTS MIT license](https://github.com/kyutai-labs/pocket-tts/blob/main/LICENSE).
+
+## Tests
+
+```powershell
+$env:PYTHONPATH='src'
+python -m pytest -q
+```
@@ -0,0 +1,37 @@
+# Security Policy
+
+## Supported Versions
+
+The `main` branch is the only supported development line until formal releases begin.
+
+## Reporting a Vulnerability
+
+Email support@scriptriva.com with:
+
+- A concise description of the issue.
+- Steps to reproduce.
+- Affected version or commit.
+- Impact and likely abuse path.
+- Any proof-of-concept files or screenshots that are safe to share.
+
+Please do not disclose vulnerabilities publicly until maintainers have had time to investigate and coordinate a fix.
+
+## Sensitive Data
+
+Do not commit:
+
+- `api_key.txt`
+- Hugging Face or OpenAI-compatible API tokens
+- custom voice samples
+- cached `.safetensors` voice states
+- generated executable artifacts
+- local config files
+
+## Security-Relevant Areas
+
+- Screen/window capture.
+- OCR text handling.
+- Local LLM endpoint configuration.
+- Custom voice file handling.
+- TTS server startup and subprocess execution.
+- Packaged binary contents.
@@ -0,0 +1,22 @@
+# Support
+
+For support inquiries, email support@scriptriva.com.
+
+## Before Asking
+
+- Check the README setup steps.
+- Confirm Tesseract OCR is available or bundled.
+- Confirm `uvx` can start Pocket TTS.
+- Try the latest build from `main`.
+- Include the exact error text shown in the GUI.
+
+## Useful Details
+
+When reporting a problem, include:
+
+- Windows version.
+- App version or commit.
+- Whether you use monitor or window capture.
+- Whether you use default voice or custom voice.
+- Whether Local LLM is enabled.
+- Steps to reproduce.
@@ -0,0 +1,45 @@
+# Third-Party Notices
+
+Seshat TTS uses and/or interfaces with third-party software. These projects retain their own licenses.
+
+## Kyutai Pocket TTS
+
+Repository: https://github.com/kyutai-labs/pocket-tts
+
+License: MIT
+
+License file: https://github.com/kyutai-labs/pocket-tts/blob/main/LICENSE
+
+Pocket TTS is used for local text-to-speech generation and voice cloning through the `uvx-server` backend. The Pocket TTS software license is MIT. Model weights, hosted files, or voice assets may have separate terms; review the upstream model and hosting pages before redistributing those assets.
+
+## Tesseract OCR
+
+Project site: https://tesseractocr.org/
+
+License: Apache License 2.0
+
+Tesseract OCR is used to extract text from selected screen regions.
+
+## pytesseract
+
+Repository: https://github.com/madmaze/pytesseract
+
+License: Apache License 2.0
+
+pytesseract is used as the Python wrapper around the Tesseract executable.
+
+## PyInstaller
+
+Project site: https://pyinstaller.org/
+
+License: GPLv2-or-later with bootloader exception
+
+PyInstaller is used to build the Windows executable.
+
+## OpenAI Python SDK
+
+Repository: https://github.com/openai/openai-python
+
+License: Apache License 2.0
+
+The OpenAI Python SDK is used to call optional OpenAI-compatible local LLM endpoints.
@@ -0,0 +1,33 @@
+# Architecture
+
+Seshat TTS is a Windows desktop utility that converts selected on-screen text into streamed speech.
+
+## Runtime Flow
+
+1. User selects a monitor or window.
+2. User selects a screen region.
+3. Hotkey triggers capture.
+4. Tesseract OCR extracts text from the selected region, unless LLM image extraction is enabled.
+5. Optional local OpenAI-compatible LLM cleans the OCR text, or a vision-capable local LLM extracts text directly from the captured region image.
+6. Pocket TTS streams speech.
+7. Pressing the capture hotkey again stops any active stream and starts a fresh one.
+8. Stop hotkey cancels active playback without starting another capture.
+
+## Key Modules
+
+- `src/seshat_tts/app.py`: Tk GUI, settings flow, hotkey orchestration.
+- `src/seshat_tts/capture.py`: monitor/window capture.
+- `src/seshat_tts/region_picker.py`: snipping-tool-style region selection.
+- `src/seshat_tts/ocr.py`: image preprocessing and Tesseract OCR.
+- `src/seshat_tts/tts.py`: Pocket TTS API/server playback.
+- `src/seshat_tts/llm.py`: OpenAI-compatible local LLM cleanup and vision-based image text extraction.
+- `src/seshat_tts/config.py`: persisted config loading and migration.
+- `src/seshat_tts/voices.py`: named custom voice profiles.
+
+## Packaging
+
+`scripts/build_exe.ps1` builds a portable PyInstaller executable using `seshat-tts-portable.spec`.
+
+The packaged EXE includes first-party resources, the project license, third-party notices, Tesseract OCR files when Tesseract is installed on the build machine, and `uvx.exe`/`uv.exe` when found on `PATH`.
+
+Pocket TTS runs through `uvx-server` in bundled builds to avoid freezing Torch and its native dependencies into the app.
@@ -0,0 +1,57 @@
+[project]
+name = "seshat-tts"
+version = "0.1.0"
+description = "Hotkey-triggered screen OCR to Pocket TTS."
+requires-python = ">=3.10,<3.15"
+readme = "README.md"
+license = { file = "LICENSE" }
+authors = [
+    { name = "Scriptriva Inc.", email = "support@scriptriva.com" },
+]
+keywords = ["ocr", "tts", "accessibility", "screen-capture", "pocket-tts"]
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Environment :: Win32 (MS Windows)",
+    "Intended Audience :: End Users/Desktop",
+    "Operating System :: Microsoft :: Windows",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.13",
+    "Topic :: Multimedia :: Sound/Audio :: Speech",
+]
+dependencies = [
+    "keyboard>=0.13.5",
+    "mss>=9.0.1",
+    "pillow>=10.0.0",
+    "pytesseract>=0.3.13",
+    "openai>=1.0.0",
+    "imageio-ffmpeg>=0.5.1",
+    "requests>=2.20.0",
+    "sounddevice>=0.4.6",
+    "pocket-tts>=2.1.0",
+    "pywin32>=306; platform_system == 'Windows'",
+]
+
+[project.optional-dependencies]
+test = [
+    "pytest>=8.0.0",
+]
+build = [
+    "pyinstaller>=6.0.0",
+]
+
+[project.scripts]
+seshat-tts = "seshat_tts.app:main"
+
+[project.urls]
+Homepage = "https://github.com/Scriptriva/seshat-tts"
+Repository = "https://github.com/Scriptriva/seshat-tts"
+Issues = "https://github.com/Scriptriva/seshat-tts/issues"
+Support = "mailto:support@scriptriva.com"
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/seshat_tts"]
@@ -0,0 +1,55 @@
+<#
+.SYNOPSIS
+Builds the Seshat TTS Windows executable with PyInstaller.
+
+.PARAMETER SkipInstall
+Skip the pip upgrade and the editable install of the build/test extras.
+
+.PARAMETER OneDir
+Build from seshat-tts.spec (output under dist\seshat-tts\) instead of
+seshat-tts-portable.spec (output dist\seshat-tts.exe).
+#>
+param(
+    [switch]$SkipInstall,
+    [switch]$OneDir
+)
+
+$ErrorActionPreference = "Stop"
+$Root = Split-Path -Parent $PSScriptRoot
+Set-Location $Root
+$DistApp = Join-Path $Root "dist\seshat-tts"
+$DistExe = Join-Path $Root "dist\seshat-tts.exe"
+
+# $ErrorActionPreference does not turn a non-zero exit code of a native command
+# (python, pip, PyInstaller) into an error, so each step is checked explicitly.
+function Assert-LastExitCode {
+    param([string]$Step)
+    if ($LASTEXITCODE -ne 0) {
+        throw "$Step failed with exit code $LASTEXITCODE."
+    }
+}
+
+# A running app keeps the previous build output locked, so stop it first.
+$RunningApps = @(Get-Process -Name "seshat-tts" -ErrorAction SilentlyContinue)
+foreach ($Process in $RunningApps) {
+    try {
+        Stop-Process -Id $Process.Id -Force -ErrorAction Stop
+    } catch {
+        # Fall back to taskkill, which also terminates child processes (/T).
+        & taskkill.exe /PID $Process.Id /T /F | Out-Null
+    }
+}
+Start-Sleep -Milliseconds 500
+$StillRunning = @(Get-Process -Name "seshat-tts" -ErrorAction SilentlyContinue)
+if ($StillRunning.Count -gt 0) {
+    $Ids = ($StillRunning | ForEach-Object { $_.Id }) -join ", "
+    throw "Close Seshat TTS before building. Could not stop running process id(s): $Ids"
+}
+
+# Remove previous outputs; retry up to five times in case files are still locked.
+foreach ($Target in @($DistApp, $DistExe)) {
+    if (Test-Path $Target) {
+        for ($Attempt = 1; $Attempt -le 5; $Attempt++) {
+            try {
+                Remove-Item -LiteralPath $Target -Recurse -Force
+                break
+            } catch {
+                if ($Attempt -eq 5) {
+                    throw
+                }
+                Start-Sleep -Seconds 2
+            }
+        }
+    }
+}
+
+if (-not $SkipInstall) {
+    python -m pip install --upgrade pip
+    Assert-LastExitCode "pip upgrade"
+    python -m pip install -e ".[build,test]"
+    Assert-LastExitCode "Dependency install"
+}
+
+if ($OneDir) {
+    python -m PyInstaller --clean --noconfirm seshat-tts.spec
+    Assert-LastExitCode "PyInstaller (seshat-tts.spec)"
+    Write-Host "Built: $Root\dist\seshat-tts\seshat-tts.exe"
+} else {
+    python -m PyInstaller --clean --noconfirm seshat-tts-portable.spec
+    Assert-LastExitCode "PyInstaller (seshat-tts-portable.spec)"
+    Write-Host "Built portable EXE: $Root\dist\seshat-tts.exe"
+    # Only the portable spec bundles Tesseract and the uv tools, so the note is limited to that build.
+    Write-Host "The portable EXE bundles the GUI runtime, bundled OCR files when Tesseract is installed on this build machine, and uvx.exe when found."
+}
@@ -0,0 +1,111 @@
+# Builds a small native launcher EXE that starts `python -m seshat_tts`.
+$ErrorActionPreference = "Stop"
+$Root = Split-Path -Parent $PSScriptRoot
+# Final launcher location and the scratch directory for generated sources.
+$OutDir = Join-Path $Root "dist\launcher"
+$OutExe = Join-Path $OutDir "seshat-tts.exe"
+$BuildDir = Join-Path $Root "build\launcher"
+$Source = Join-Path $BuildDir "SeshatTtsLauncher.cs"
+$Project = Join-Path $BuildDir "SeshatTtsLauncher.csproj"
+$Icon = Join-Path $Root "resources\seshat-tts.ico"
+$BuildIcon = Join-Path $BuildDir "seshat-tts.ico"
+
+New-Item -ItemType Directory -Force -Path $OutDir | Out-Null
+New-Item -ItemType Directory -Force -Path $BuildDir | Out-Null
+# Clear sources and intermediate output left by a previous run so the build starts clean.
+Get-ChildItem -Path (Join-Path $BuildDir "*") -File -Include "*.cs", "*.csproj" | Remove-Item -Force
+foreach ($GeneratedDir in @("bin", "obj")) {
+    $Path = Join-Path $BuildDir $GeneratedDir
+    if (Test-Path $Path) {
+        Remove-Item -Path $Path -Recurse -Force
+    }
+}
+# The icon is optional; copy it next to the generated project when present.
+if (Test-Path $Icon) {
+    Copy-Item -Path $Icon -Destination $BuildIcon -Force
+}
+
+@'
+using System;
+using System.Diagnostics;
+using System.IO;
+
+// Starts "python -m seshat_tts" from the repository checkout two directories
+// above the launcher, forwarding any command-line arguments.
+public static class SeshatTtsLauncher
+{
+    // Returns 0 when a Python process was started and 1 when Process.Start returned null.
+    public static int Main(string[] args)
+    {
+        string exeDir = AppDomain.CurrentDomain.BaseDirectory;
+        string root = Path.GetFullPath(Path.Combine(exeDir, "..", ".."));
+        string venvPythonw = Path.Combine(root, ".venv", "Scripts", "pythonw.exe");
+        string venvPython = Path.Combine(root, ".venv", "Scripts", "python.exe");
+
+        // Prefer the project's virtual environment; otherwise rely on pythonw.exe from PATH.
+        string python = File.Exists(venvPythonw) ? venvPythonw :
+            File.Exists(venvPython) ? venvPython :
+            "pythonw.exe";
+
+        string arguments = "-m seshat_tts";
+        if (args.Length > 0)
+        {
+            arguments += " " + string.Join(" ", Array.ConvertAll(args, Quote));
+        }
+
+        ProcessStartInfo start = new ProcessStartInfo();
+        start.FileName = python;
+        start.Arguments = arguments;
+        start.WorkingDirectory = root;
+        start.UseShellExecute = false;
+        start.CreateNoWindow = true;
+        // Make the package importable from the source tree without installing it.
+        start.EnvironmentVariables["PYTHONPATH"] = Path.Combine(root, "src");
+
+        try
+        {
+            Process process = Process.Start(start);
+            return process == null ? 1 : 0;
+        }
+        catch
+        {
+            // The chosen interpreter could not be started; retry with python.exe and a visible window.
+            start.FileName = "python.exe";
+            start.CreateNoWindow = false;
+            Process process = Process.Start(start);
+            return process == null ? 1 : 0;
+        }
+    }
+
+    // Quotes one argument following the Windows C runtime command-line rules:
+    // backslashes are literal unless they precede a double quote, in which case
+    // they are doubled and the quote itself is escaped. Empty arguments are
+    // emitted as "" so they are not lost.
+    private static string Quote(string value)
+    {
+        if (value.Length > 0 && value.IndexOfAny(new char[] {' ', '\t', '"'}) < 0)
+        {
+            return value;
+        }
+
+        System.Text.StringBuilder quoted = new System.Text.StringBuilder("\"");
+        int backslashes = 0;
+        foreach (char c in value)
+        {
+            if (c == '\\')
+            {
+                backslashes++;
+                continue;
+            }
+            if (c == '"')
+            {
+                // Double the pending backslashes, then escape the quote.
+                quoted.Append('\\', backslashes * 2 + 1);
+            }
+            else
+            {
+                quoted.Append('\\', backslashes);
+            }
+            quoted.Append(c);
+            backslashes = 0;
+        }
+        // Trailing backslashes would otherwise escape the closing quote.
+        quoted.Append('\\', backslashes * 2);
+        quoted.Append('"');
+        return quoted.ToString();
+    }
+}
+'@ | Set-Content -Path $Source -Encoding UTF8
+
+# Compile with csc.exe when it is on PATH; otherwise fall back to the .NET SDK.
+$compiler = Get-Command csc.exe -ErrorAction SilentlyContinue
+if ($compiler) {
+    # The icon switch is passed only when the icon file exists.
+    $iconArg = if (Test-Path $Icon) { "/win32icon:$Icon" } else { $null }
+    & $compiler.Source /nologo /target:winexe /out:$OutExe $iconArg $Source
+    # Native commands do not honor $ErrorActionPreference, so check the exit code.
+    if ($LASTEXITCODE -ne 0) {
+        throw "csc.exe failed with exit code $LASTEXITCODE."
+    }
+} else {
+    $dotnet = Get-Command dotnet.exe -ErrorAction SilentlyContinue
+    if (-not $dotnet) {
+        throw "No C# compiler found. Install the .NET SDK or add csc.exe to PATH."
+    }
+
+    # The icon entries are conditional so the project still builds when
+    # resources\seshat-tts.ico was not copied into the build directory.
+@'
+<Project Sdk="Microsoft.NET.Sdk">
+  <PropertyGroup>
+    <OutputType>WinExe</OutputType>
+    <TargetFramework>net10.0-windows</TargetFramework>
+    <ImplicitUsings>disable</ImplicitUsings>
+    <Nullable>disable</Nullable>
+    <AssemblyName>seshat-tts</AssemblyName>
+    <ApplicationIcon Condition="Exists('$(MSBuildProjectDirectory)\seshat-tts.ico')">seshat-tts.ico</ApplicationIcon>
+  </PropertyGroup>
+  <ItemGroup>
+    <Content Include="seshat-tts.ico" Condition="Exists('$(MSBuildProjectDirectory)\seshat-tts.ico')" />
+  </ItemGroup>
+</Project>
+'@ | Set-Content -Path $Project -Encoding UTF8
+
+    dotnet publish $Project -c Release -o $OutDir --nologo
+    if ($LASTEXITCODE -ne 0) {
+        throw "dotnet publish failed with exit code $LASTEXITCODE."
+    }
+}
+
+Write-Host "Built fast launcher: $OutExe"
+Write-Host "This launcher expects dependencies installed in .venv or the active Python environment."
@@ -0,0 +1,7 @@
+from __future__ import annotations
+
+from seshat_tts.app import main
+
+
+# Start the application only when this file is executed directly, not when it is imported.
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,94 @@
+# -*- mode: python ; coding: utf-8 -*-
+
+import os
+import shutil
+from pathlib import Path
+
+ROOT = Path.cwd()
+
+datas = []
+binaries = []
+
+resources = ROOT / "resources"
+if resources.exists():
+    datas.append((str(resources), "resources"))
+
+license_file = ROOT / "LICENSE"
+if license_file.exists():
+    datas.append((str(license_file), "."))
+
+third_party_notices = ROOT / "THIRD_PARTY_NOTICES.md"
+if third_party_notices.exists():
+    datas.append((str(third_party_notices), "."))
+
+tesseract_dir = Path(os.environ.get("SESHAT_TESSERACT_DIR", r"C:\Program Files\Tesseract-OCR"))
+if tesseract_dir.exists():
+    datas.append((str(tesseract_dir), "tesseract"))
+
+for tool_name in ("uvx.exe", "uv.exe"):
+    tool = shutil.which(tool_name)
+    if tool:
+        binaries.append((tool, "tools"))
+
+# Dependency analysis for the portable build, starting from the PyInstaller entry script.
+a = Analysis(
+    ["scripts/pyinstaller_entry.py"],
+    pathex=[str(ROOT / "src")],
+    binaries=binaries,
+    datas=datas,
+    # Modules named explicitly so they are included in the bundle.
+    hiddenimports=[
+        "pytesseract",
+        "mss",
+        "keyboard",
+        "sounddevice",
+        "imageio_ffmpeg",
+        "openai",
+        "win32gui",
+        "win32con",
+        "win32ui",
+    ],
+    hookspath=[],
+    hooksconfig={},
+    runtime_hooks=[],
+    # Pocket TTS, Torch and other heavy or development-only packages are kept out of the bundle.
+    excludes=[
+        "pocket_tts",
+        "torch",
+        "torchaudio",
+        "torchvision",
+        "torchao",
+        "xformers",
+        "triton",
+        "bitsandbytes",
+        "pandas",
+        "scipy",
+        "matplotlib",
+        "pyarrow",
+        "numba",
+        "llvmlite",
+        "pytest",
+        "IPython",
+        "jupyter",
+    ],
+    noarchive=False,
+    optimize=0,
+)
+pyz = PYZ(a.pure)
+
+# Binaries and data files are passed to EXE directly (there is no COLLECT step),
+# producing the single dist/seshat-tts.exe reported by scripts/build_exe.ps1.
+exe = EXE(
+    pyz,
+    a.scripts,
+    a.binaries,
+    a.datas,
+    [],
+    name="seshat-tts",
+    debug=False,
+    bootloader_ignore_signals=False,
+    strip=False,
+    upx=True,
+    console=False,  # GUI application: no console window
+    disable_windowed_traceback=False,
+    argv_emulation=False,
+    target_arch=None,
+    codesign_identity=None,
+    entitlements_file=None,
+    # NOTE(review): unlike the data files above, the icon path is not checked for existence.
+    icon=str(ROOT / "resources" / "seshat-tts.ico"),
+)
@@ -0,0 +1,80 @@
+# -*- mode: python ; coding: utf-8 -*-
+
+from pathlib import Path
+
+ROOT = Path.cwd()
+
+# Dependency analysis for the one-folder build, starting from the PyInstaller entry script.
+a = Analysis(
+    ["scripts/pyinstaller_entry.py"],
+    pathex=[str(ROOT / "src")],
+    binaries=[],
+    # Resources, license and third-party notices are each added only when present.
+    datas=(
+        ([(str(ROOT / "resources"), "resources")] if (ROOT / "resources").exists() else [])
+        + ([(str(ROOT / "LICENSE"), ".")] if (ROOT / "LICENSE").exists() else [])
+        + ([(str(ROOT / "THIRD_PARTY_NOTICES.md"), ".")] if (ROOT / "THIRD_PARTY_NOTICES.md").exists() else [])
+    ),
+    # Modules named explicitly so they are included in the bundle.
+    hiddenimports=[
+        "pytesseract",
+        "mss",
+        "keyboard",
+        "sounddevice",
+        "imageio_ffmpeg",
+        "openai",
+        "win32gui",
+        "win32con",
+        "win32ui",
+    ],
+    hookspath=[],
+    hooksconfig={},
+    runtime_hooks=[],
+    # Pocket TTS, Torch and other heavy or development-only packages are kept out of the bundle.
+    excludes=[
+        "pocket_tts",
+        "torch",
+        "torchaudio",
+        "torchvision",
+        "torchao",
+        "xformers",
+        "triton",
+        "bitsandbytes",
+        "pandas",
+        "scipy",
+        "matplotlib",
+        "pyarrow",
+        "numba",
+        "llvmlite",
+        "pytest",
+        "IPython",
+        "jupyter",
+    ],
+    noarchive=False,
+    optimize=0,
+)
+pyz = PYZ(a.pure)
+
+# The executable holds only the scripts; binaries and data are gathered by COLLECT below.
+exe = EXE(
+    pyz,
+    a.scripts,
+    [],
+    exclude_binaries=True,
+    name="seshat-tts",
+    debug=False,
+    bootloader_ignore_signals=False,
+    strip=False,
+    upx=True,
+    console=False,  # GUI application: no console window
+    disable_windowed_traceback=False,
+    argv_emulation=False,
+    target_arch=None,
+    codesign_identity=None,
+    entitlements_file=None,
+    # NOTE(review): the icon path is not checked for existence, unlike the data files.
+    icon=str(ROOT / "resources" / "seshat-tts.ico"),
+)
+# Gather the executable, binaries and data files into the "seshat-tts" output folder.
+coll = COLLECT(
+    exe,
+    a.binaries,
+    a.datas,
+    strip=False,
+    upx=True,
+    upx_exclude=[],
+    name="seshat-tts",
+)
@@ -0,0 +1,5 @@
+"""Hotkey OCR capture to Pocket TTS."""
+
+__all__ = ["__version__"]
+
+__version__ = "0.1.0"
@@ -0,0 +1,6 @@
+from .app import main
+
+
+# Entry point for `python -m seshat_tts`.
+if __name__ == "__main__":
+    main()
+
@@ -0,0 +1,116 @@
+from __future__ import annotations
+
+import ctypes
+from dataclasses import dataclass
+
+import mss
+from PIL import Image
+import win32gui
+import win32ui
+
+from .config import Rect
+
+
+@dataclass(frozen=True, slots=True)
+class MonitorInfo:
+    index: int
+    left: int
+    top: int
+    width: int
+    height: int
+
+    @property
+    def label(self) -> str:
+        return f"{self.index}: {self.width}x{self.height} at {self.left},{self.top}"
+
+
+def list_monitors() -> list[MonitorInfo]:
+    with mss.mss() as sct:
+        return [
+            MonitorInfo(
+                index=index,
+                left=int(monitor["left"]),
+                top=int(monitor["top"]),
+                width=int(monitor["width"]),
+                height=int(monitor["height"]),
+            )
+            for index, monitor in enumerate(sct.monitors)
+            if index != 0
+        ]
+
+
+def capture_absolute_region(left: int, top: int, width: int, height: int) -> Image.Image:
+    with mss.mss() as sct:
+        grab = {
+            "left": left,
+            "top": top,
+            "width": width,
+            "height": height,
+        }
+        shot = sct.grab(grab)
+        return Image.frombytes("RGB", shot.size, shot.rgb)
+
+
+def capture_monitor_region(monitor_index: int, rect: Rect) -> Image.Image:
+    with mss.mss() as sct:
+        if monitor_index <= 0 or monitor_index >= len(sct.monitors):
+            raise ValueError(f"Monitor {monitor_index} is not available.")
+        monitor = sct.monitors[monitor_index]
+        return capture_absolute_region(
+            int(monitor["left"]) + rect.left,
+            int(monitor["top"]) + rect.top,
+            rect.width,
+            rect.height,
+        )
+
+
+def capture_window_region(hwnd: int, rect: Rect) -> Image.Image:
+    image = capture_window(hwnd)
+    if rect.left < 0 or rect.top < 0 or rect.width <= 0 or rect.height <= 0:
+        raise ValueError("Capture region must be inside the selected window.")
+    if rect.left + rect.width > image.width or rect.top + rect.height > image.height:
+        raise ValueError("Capture region is outside the selected window. Select the region again in window mode.")
+    return image.crop((rect.left, rect.top, rect.left + rect.width, rect.top + rect.height))
+
+
+def capture_window(hwnd: int) -> Image.Image:
+    left, top, right, bottom = win32gui.GetWindowRect(hwnd)
+    width = right - left
+    height = bottom - top
+    if width <= 0 or height <= 0:
+        raise ValueError("Selected window has no capturable size.")
+
+    hwnd_dc = win32gui.GetWindowDC(hwnd)
+    source_dc = win32ui.CreateDCFromHandle(hwnd_dc)
+    memory_dc = source_dc.CreateCompatibleDC()
+    bitmap = win32ui.CreateBitmap()
+    bitmap.CreateCompatibleBitmap(source_dc, width, height)
+    memory_dc.SelectObject(bitmap)
+
+    try:
+        result = _print_window(hwnd, memory_dc.GetSafeHdc(), 2)
+        if result != 1:
+            result = _print_window(hwnd, memory_dc.GetSafeHdc(), 0)
+        if result != 1:
+            raise RuntimeError("PrintWindow failed for the selected window.")
+
+        info = bitmap.GetInfo()
+        bits = bitmap.GetBitmapBits(True)
+        return Image.frombuffer(
+            "RGB",
+            (info["bmWidth"], info["bmHeight"]),
+            bits,
+            "raw",
+            "BGRX",
+            0,
+            1,
+        ).copy()
+    finally:
+        win32gui.DeleteObject(bitmap.GetHandle())
+        memory_dc.DeleteDC()
+        source_dc.DeleteDC()
+        win32gui.ReleaseDC(hwnd, hwnd_dc)
+
+
+def _print_window(hwnd: int, hdc: int, flags: int) -> int:
+    return int(ctypes.windll.user32.PrintWindow(hwnd, hdc, flags))
@@ -0,0 +1,125 @@
+from __future__ import annotations
+
+import json
+import sys
+from dataclasses import asdict, dataclass, field
+from pathlib import Path
+from typing import Any
+
+from .tesseract import find_tesseract
+
+
+APP_DIR = Path.home() / ".seshat-tts"
+CONFIG_PATH = APP_DIR / "config.json"
+
+
+@dataclass(slots=True)
+class Rect:
+    """Rectangle given by its top-left corner and size.
+
+    The field defaults are the region used when nothing has been stored.
+    """
+
+    left: int = 0
+    top: int = 25
+    width: int = 720
+    height: int = 305
+
+
+@dataclass(slots=True)
+class AppConfig:
+    """Settings that ``load_config`` and ``save_config`` read from and write to ``config.json``."""
+
+    # Capture target.
+    capture_mode: str = "monitor"
+    monitor_index: int = 1
+    window_title: str = ""
+    # Hotkeys.
+    hotkey: str = "ctrl+alt+n"
+    capture_region_hotkey: str = "ctrl+alt+r"
+    stop_hotkey: str = "ctrl+alt+s"
+    # Region that is captured for text extraction.
+    dialogue_rect: Rect = field(default_factory=Rect)
+    # Tesseract executable path; the default is whatever find_tesseract() returns.
+    tesseract_cmd: str = field(default_factory=find_tesseract)
+    # Voice and TTS settings.
+    voice_source: str = "default"
+    default_voice: str = "alba"
+    custom_voice_name: str = ""
+    voice_path: str = ""
+    language: str = "english"
+    quantize_tts: bool = False
+    volume_gain: float = 1.0
+    tts_backend: str = "uvx-server"
+    tts_host: str = "localhost"
+    tts_port: int = 8000
+    # Optional OpenAI-compatible LLM settings.
+    llm_enabled: bool = False
+    llm_base_url: str = "http://127.0.0.1:8000/v1"
+    llm_api_key: str = ""
+    llm_model: str = "current"
+    llm_timeout: float = 5.0
+    llm_max_tokens: int = 256
+    llm_disable_thinking: bool = True
+    llm_image_extraction: bool = False
+    llm_system_prompt: str = (
+        "Clean OCR text for text-to-speech. Return only the corrected text. "
+        "Do not explain, add commentary, summarize, or change the meaning."
+    )
+    # Text persisted from a previous run.
+    last_text: str = ""
+
+
+def _rect_from_dict(value: dict[str, Any] | None) -> Rect:
+    if not value:
+        return Rect()
+    return Rect(**{field: int(value.get(field, getattr(Rect(), field))) for field in Rect.__dataclass_fields__})
+
+
+def _clean_last_text(value: Any) -> str:
+    lines = str(value or "").splitlines()
+    cleaned = [
+        line
+        for line in lines
+        if not line.strip().casefold().startswith(("capture region:", "text region:"))
+    ]
+    return "\n".join(cleaned).strip()
+
+
+def _tesseract_from_config(value: Any) -> str:
+    detected = find_tesseract()
+    if getattr(sys, "frozen", False) and detected:
+        return detected
+    return str(value or detected)
+
+
+def load_config(path: Path = CONFIG_PATH) -> AppConfig:
+    if not path.exists():
+        return AppConfig()
+    data = json.loads(path.read_text(encoding="utf-8"))
+    return AppConfig(
+        capture_mode=str(data.get("capture_mode", "monitor")),
+        monitor_index=int(data.get("monitor_index", 1)),
+        window_title=str(data.get("window_title", "")),
+        hotkey=str(data.get("hotkey", "ctrl+alt+n")),
+        capture_region_hotkey=str(data.get("capture_region_hotkey", "ctrl+alt+r")),
+        stop_hotkey=str(data.get("stop_hotkey", "ctrl+alt+s")),
+        dialogue_rect=_rect_from_dict(data.get("dialogue_rect")),
+        tesseract_cmd=_tesseract_from_config(data.get("tesseract_cmd")),
+        voice_source=str(data.get("voice_source", "default")),
+        default_voice=str(data.get("default_voice", "alba")),
+        custom_voice_name=str(data.get("custom_voice_name", "")),
+        voice_path=str(data.get("voice_path", "")),
+        language="english",
+        quantize_tts=bool(data.get("quantize_tts", False)),
+        volume_gain=float(data.get("volume_gain", 1.0)),
+        tts_backend=str(data.get("tts_backend", "uvx-server")),
+        tts_host=str(data.get("tts_host", "localhost")),
+        tts_port=int(data.get("tts_port", 8000)),
+        llm_enabled=bool(data.get("llm_enabled", False)),
+        llm_base_url=str(data.get("llm_base_url", "http://127.0.0.1:8000/v1")),
+        llm_api_key=str(data.get("llm_api_key", "")),
+        llm_model=str(data.get("llm_model", "unsloth")),
+        llm_timeout=float(data.get("llm_timeout", 5.0)),
+        llm_max_tokens=int(data.get("llm_max_tokens", 256)),
+        llm_disable_thinking=bool(data.get("llm_disable_thinking", True)),
+        llm_image_extraction=bool(data.get("llm_image_extraction", False)),
+        llm_system_prompt=str(
+            data.get(
+                "llm_system_prompt",
+                AppConfig.__dataclass_fields__["llm_system_prompt"].default,
+            )
+        ),
+        last_text=_clean_last_text(data.get("last_text", "")),
+    )
+
+
+def save_config(config: AppConfig, path: Path = CONFIG_PATH) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(json.dumps(asdict(config), indent=2), encoding="utf-8")
@@ -0,0 +1,30 @@
+from __future__ import annotations
+
+from collections.abc import Callable
+
+import keyboard
+
+
+class HotkeyManager:
+    def __init__(self) -> None:
+        self._handles: dict[str, object] = {}
+
+    def register(self, name: str, hotkey: str, callback: Callable[[], None]) -> None:
+        self.unregister(name)
+        if not hotkey.strip():
+            return
+        self._handles[name] = keyboard.add_hotkey(hotkey, callback, suppress=False, trigger_on_release=False)
+
+    def unregister(self, name: str | None = None) -> None:
+        if name is not None:
+            handle = self._handles.pop(name, None)
+            if handle is not None:
+                keyboard.remove_hotkey(handle)
+            return
+        for handle in self._handles.values():
+            keyboard.remove_hotkey(handle)
+        self._handles.clear()
+
+
+def listen_for_hotkey() -> str:
+    return keyboard.read_hotkey(suppress=False)
@@ -0,0 +1,143 @@
+from __future__ import annotations
+
+import base64
+from io import BytesIO
+from pathlib import Path
+from typing import Protocol
+
+from PIL import Image
+
+
+DEFAULT_API_KEY_PATH = Path.home() / ".seshat-tts" / "llm_api_key.txt"
+IMAGE_EXTRACTION_SYSTEM_PROMPT = (
+    "Extract only the visible readable text from the supplied image for text-to-speech. "
+    "Preserve the original wording and sentence order. Do not describe the image, "
+    "do not add commentary, and do not include UI labels unless they are part of the text to read."
+)
+IMAGE_EXTRACTION_USER_PROMPT = "Read the text in this selected screen region and return only that text."
+
+
+# Structural types covering only the part of a client this module touches
+# (client.chat.completions.create), so any object with that shape can be
+# passed in place of a real OpenAI client.
+class _ChatCompletions(Protocol):
+    def create(self, **kwargs: object) -> object: ...
+
+
+class _Chat(Protocol):
+    completions: _ChatCompletions
+
+
+class _OpenAIClient(Protocol):
+    chat: _Chat
+
+
+def load_api_key_file(path: Path = DEFAULT_API_KEY_PATH) -> str:
+    if not path.exists():
+        return ""
+    return path.read_text(encoding="utf-8").strip()
+
+
+def process_text_with_llm(
+    text: str,
+    *,
+    enabled: bool,
+    base_url: str,
+    api_key: str,
+    model: str,
+    system_prompt: str,
+    timeout: float = 5.0,
+    max_tokens: int = 256,
+    disable_thinking: bool = True,
+    client: _OpenAIClient | None = None,
+) -> str:
+    text = text.strip()
+    if not enabled or not text:
+        return text
+
+    if client is None:
+        from openai import OpenAI
+
+        client = OpenAI(
+            api_key=api_key.strip() or "local",
+            base_url=base_url.strip(),
+            timeout=max(0.1, float(timeout)),
+        )
+
+    request: dict[str, object] = {
+        "model": model.strip(),
+        "messages": [
+            {"role": "system", "content": system_prompt.strip()},
+            {"role": "user", "content": text},
+        ],
+        "temperature": 0,
+        "max_tokens": max(1, int(max_tokens)),
+        "stream": False,
+    }
+    if disable_thinking:
+        request["extra_body"] = {
+            "chat_template_kwargs": {"enable_thinking": False},
+            "enable_thinking": False,
+            "reasoning_effort": "none",
+        }
+
+    response = client.chat.completions.create(**request)
+    content = response.choices[0].message.content
+    return str(content or "").strip() or text
+
+
+def process_image_with_llm(
+    image: Image.Image,
+    *,
+    base_url: str,
+    api_key: str,
+    model: str,
+    timeout: float = 5.0,
+    max_tokens: int = 256,
+    disable_thinking: bool = True,
+    client: _OpenAIClient | None = None,
+) -> str:
+    if client is None:
+        from openai import OpenAI
+
+        client = OpenAI(
+            api_key=api_key.strip() or "local",
+            base_url=base_url.strip(),
+            timeout=max(0.1, float(timeout)),
+        )
+
+    request: dict[str, object] = {
+        "model": model.strip(),
+        "messages": [
+            {"role": "system", "content": IMAGE_EXTRACTION_SYSTEM_PROMPT},
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": IMAGE_EXTRACTION_USER_PROMPT},
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:image/png;base64,{_image_to_base64_png(image)}",
+                            "detail": "high",
+                        },
+                    },
+                ],
+            },
+        ],
+        "temperature": 0,
+        "max_tokens": max(1, int(max_tokens)),
+        "stream": False,
+    }
+    if disable_thinking:
+        request["extra_body"] = {
+            "chat_template_kwargs": {"enable_thinking": False},
+            "enable_thinking": False,
+            "reasoning_effort": "none",
+        }
+
+    response = client.chat.completions.create(**request)
+    content = response.choices[0].message.content
+    return str(content or "").strip()
+
+
+def _image_to_base64_png(image: Image.Image) -> str:
+    buffer = BytesIO()
+    image.convert("RGB").save(buffer, format="PNG")
+    return base64.b64encode(buffer.getvalue()).decode("ascii")
@@ -0,0 +1,60 @@
+from __future__ import annotations
+
+import os
+from pathlib import Path
+
+from PIL import Image, ImageEnhance, ImageFilter, ImageOps
+
+from .tesseract import tesseract_help_message
+
+
+def preprocess_for_ocr(image: Image.Image) -> Image.Image:
+    image = ImageOps.expand(image, border=12, fill=(0, 0, 0))
+    gray = ImageOps.grayscale(image)
+    enlarged = gray.resize((gray.width * 2, gray.height * 2), Image.Resampling.LANCZOS)
+    contrast = ImageEnhance.Contrast(enlarged).enhance(2.2)
+    sharpened = contrast.filter(ImageFilter.SHARPEN)
+    return sharpened.point(lambda pixel: 255 if pixel > 145 else 0)
+
+
+def image_to_lines(image: Image.Image, tesseract_cmd: str = "") -> list[str]:
+    import pytesseract
+    from pytesseract import TesseractNotFoundError
+
+    if tesseract_cmd:
+        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
+    tessdata = _tessdata_dir(tesseract_cmd)
+    if tessdata is not None:
+        os.environ["TESSDATA_PREFIX"] = str(tessdata)
+    config = "--psm 6 --oem 3"
+    try:
+        text = pytesseract.image_to_string(image, lang="eng", config=config)
+    except TesseractNotFoundError as exc:
+        raise RuntimeError(tesseract_help_message()) from exc
+    return [normalize_line(line) for line in text.splitlines() if normalize_line(line)]
+
+
+def normalize_line(line: str) -> str:
+    import re
+
+    line = re.sub(r"\s+", " ", line).strip()
+    line = line.replace("“", '"').replace("”", '"').replace("‘", "'").replace("’", "'")
+    return line
+
+
+def extract_text_from_lines(lines: list[str]) -> str:
+    return " ".join(lines).strip()
+
+
+def extract_ocr_text(image: Image.Image, tesseract_cmd: str = "") -> str:
+    processed = preprocess_for_ocr(image)
+    return extract_text_from_lines(image_to_lines(processed, tesseract_cmd))
+
+
+def _tessdata_dir(tesseract_cmd: str) -> Path | None:
+    if not tesseract_cmd:
+        return None
+    tessdata = Path(tesseract_cmd).resolve().parent / "tessdata"
+    if tessdata.exists():
+        return tessdata
+    return None
@@ -0,0 +1,100 @@
+from __future__ import annotations
+
+import tkinter as tk
+from collections.abc import Callable
+
+from PIL import ImageEnhance, ImageTk
+
+from .capture import capture_absolute_region
+from .config import Rect
+
+
+class RegionPicker(tk.Toplevel):
+    def __init__(
+        self,
+        parent: tk.Tk,
+        bounds: Rect,
+        on_selected: Callable[[Rect], None],
+    ) -> None:
+        super().__init__(parent)
+        self._bounds = bounds
+        self._on_selected = on_selected
+        self._start_x = 0
+        self._start_y = 0
+        self._rect_id: int | None = None
+        self._label_id: int | None = None
+
+        self.overrideredirect(True)
+        self.attributes("-topmost", True)
+        self.geometry(f"{bounds.width}x{bounds.height}{bounds.left:+d}{bounds.top:+d}")
+        self.configure(cursor="crosshair")
+
+        screenshot = capture_absolute_region(bounds.left, bounds.top, bounds.width, bounds.height)
+        dimmed = ImageEnhance.Brightness(screenshot).enhance(0.55)
+        self._image = ImageTk.PhotoImage(dimmed)
+
+        self.canvas = tk.Canvas(self, bg="#050608", highlightthickness=0, cursor="crosshair")
+        self.canvas.pack(fill=tk.BOTH, expand=True)
+        self.canvas.create_image(0, 0, image=self._image, anchor=tk.NW)
+        self.canvas.create_text(
+            18,
+            18,
+            text="Drag to select. Esc cancels.",
+            fill="#f4f7fb",
+            anchor=tk.NW,
+            font=("Segoe UI", 12, "bold"),
+        )
+        self.canvas.bind("<ButtonPress-1>", self._on_press)
+        self.canvas.bind("<B1-Motion>", self._on_drag)
+        self.canvas.bind("<ButtonRelease-1>", self._on_release)
+        self.bind("<Escape>", lambda _event: self.destroy())
+
+        self.focus_force()
+        self.grab_set()
+
+    def _on_press(self, event: tk.Event) -> None:
+        self._start_x = int(event.x)
+        self._start_y = int(event.y)
+        self._rect_id = self.canvas.create_rectangle(
+            self._start_x,
+            self._start_y,
+            self._start_x,
+            self._start_y,
+            outline="#ff365f",
+            width=3,
+        )
+
+    def _on_drag(self, event: tk.Event) -> None:
+        if self._rect_id is not None:
+            x1, x2 = sorted((self._start_x, int(event.x)))
+            y1, y2 = sorted((self._start_y, int(event.y)))
+            self.canvas.coords(self._rect_id, x1, y1, x2, y2)
+            label = f"{x2 - x1} x {y2 - y1}"
+            if self._label_id is None:
+                self._label_id = self.canvas.create_text(
+                    x1 + 8,
+                    max(12, y1 - 18),
+                    text=label,
+                    fill="#f4f7fb",
+                    anchor=tk.W,
+                    font=("Segoe UI", 10, "bold"),
+                )
+            else:
+                self.canvas.coords(self._label_id, x1 + 8, max(12, y1 - 18))
+                self.canvas.itemconfigure(self._label_id, text=label)
+
+    def _on_release(self, event: tk.Event) -> None:
+        x1, x2 = sorted((self._start_x, int(event.x)))
+        y1, y2 = sorted((self._start_y, int(event.y)))
+        self.grab_release()
+        self.destroy()
+        if x2 - x1 < 4 or y2 - y1 < 4:
+            return
+        self._on_selected(
+            Rect(
+                left=self._bounds.left + x1,
+                top=self._bounds.top + y1,
+                width=x2 - x1,
+                height=y2 - y1,
+            )
+        )
@@ -0,0 +1,9 @@
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+
+def resource_path(relative_path: str) -> Path:
+    base = Path(getattr(sys, "_MEIPASS", Path(__file__).resolve().parents[2]))
+    return base / relative_path
@@ -0,0 +1,33 @@
+from __future__ import annotations
+
+import shutil
+from pathlib import Path
+
+from .resources import resource_path
+
+
+# Fixed install locations probed by find_tesseract() after the bundled copy
+# and the PATH lookup both come up empty.
+COMMON_TESSERACT_PATHS = (
+    Path(r"C:\Program Files\Tesseract-OCR\tesseract.exe"),
+    Path(r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe"),
+)
+
+
+def find_tesseract() -> str:
+    bundled = resource_path("tesseract/tesseract.exe")
+    if bundled.exists():
+        return str(bundled)
+    from_path = shutil.which("tesseract")
+    if from_path:
+        return from_path
+    for path in COMMON_TESSERACT_PATHS:
+        if path.exists():
+            return str(path)
+    return ""
+
+
+def tesseract_help_message() -> str:
+    return (
+        "Tesseract OCR is not installed or the executable is not configured. "
+        "Install it with `winget install UB-Mannheim.TesseractOCR`, then restart the app, "
+        "or select tesseract.exe in the GUI."
+    )
@@ -0,0 +1,576 @@
+from __future__ import annotations
+
+import collections
+import functools
+import hashlib
+import http.server
+import importlib
+import os
+import queue
+import shutil
+import subprocess
+import threading
+import time
+from pathlib import Path
+from urllib.parse import quote, urljoin
+
+import numpy as np
+import requests
+import sounddevice as sd
+
+from .resources import resource_path
+from .voices import safe_voice_slug
+
+
+# Per-user cache for converted voice prompts (.wav) and exported voice states
+# (.safetensors); also the directory served by the local voice-state server.
+VOICE_CACHE_DIR = Path.home() / ".seshat-tts" / "voices"
+
+
+class PocketTTSStreamer:
+    """In-process Pocket TTS backend that streams generated audio to an output device.
+
+    The ``pocket_tts`` model and the voice state are loaded on first use and
+    cached on the instance. Progress and error text is published on
+    ``status_queue``. Each ``speak_async`` call cancels the previous stream by
+    setting its cancel event and then runs on a new daemon thread.
+    """
+
+    def __init__(
+        self,
+        voice_path: str | Path,
+        language: str = "english",
+        quantize: bool = False,
+        voice_source: str = "default",
+        default_voice: str = "alba",
+        custom_voice_name: str = "",
+        volume_gain: float = 1.0,
+    ) -> None:
+        """Store the settings; nothing is loaded until preload or first speak.
+
+        Args:
+            voice_path: Custom voice file, used when ``voice_source`` is not ``"default"``.
+            language: Passed to ``load_model`` and to the MP3 conversion cache key.
+            quantize: Passed to ``load_model``.
+            voice_source: ``"default"`` selects ``default_voice``; any other value selects ``voice_path``.
+            default_voice: Voice identifier used for the default source.
+            custom_voice_name: Stored on the instance; not read by this class.
+            volume_gain: Playback gain, clamped by ``_clamp_volume_gain``.
+        """
+        self.voice_path = str(voice_path)
+        self.language = language
+        self.quantize = quantize
+        self.voice_source = voice_source
+        self.default_voice = default_voice
+        self.custom_voice_name = custom_voice_name
+        self.volume_gain = _clamp_volume_gain(volume_gain)
+        self._model = None
+        self._voice_state = None
+        # _lock serializes model loading and playback; _cancel_lock guards
+        # swapping/setting the current cancel event.
+        self._lock = threading.Lock()
+        self._cancel_lock = threading.Lock()
+        self._cancel_event = threading.Event()
+        self._status_queue: queue.Queue[str] = queue.Queue()
+
+    @property
+    def status_queue(self) -> queue.Queue[str]:
+        """Queue of human-readable status and error messages."""
+        return self._status_queue
+
+    def speak_async(self, text: str) -> None:
+        """Cancel any current stream and speak *text* on a daemon thread.
+
+        Blank text (after stripping) is ignored and cancels nothing.
+        """
+        text = text.strip()
+        if not text:
+            return
+        cancel_event = self._begin_new_stream()
+        threading.Thread(target=self._speak, args=(text, cancel_event), daemon=True).start()
+
+    def preload_async(self) -> None:
+        """Load the model and voice on a daemon thread."""
+        threading.Thread(target=self._preload, daemon=True).start()
+
+    def test_async(self) -> None:
+        """Speak a fixed test sentence."""
+        self.speak_async("This is a Pocket TTS test.")
+
+    def close(self) -> None:
+        """Stop the current stream."""
+        self.stop()
+
+    def stop(self) -> None:
+        """Signal the current stream to stop at its next chunk boundary."""
+        with self._cancel_lock:
+            self._cancel_event.set()
+
+    def _begin_new_stream(self) -> threading.Event:
+        """Cancel the current stream and return a fresh event for the next one."""
+        with self._cancel_lock:
+            self._cancel_event.set()
+            self._cancel_event = threading.Event()
+            return self._cancel_event
+
+    def _preload(self) -> None:
+        """Thread target for ``preload_async``; reports failures on the status queue."""
+        with self._lock:
+            try:
+                self._load()
+            except Exception as exc:
+                self._status_queue.put(f"TTS preload error: {exc}")
+
+    def _load(self) -> None:
+        """Import ``pocket_tts``, load the model and the voice state if not cached.
+
+        Callers in this class hold ``self._lock`` while calling this.
+
+        Raises:
+            RuntimeError: If importing ``pocket_tts`` fails with ImportError/OSError,
+                or if ``load_model`` fails with OSError.
+        """
+        if self._model is not None and self._voice_state is not None:
+            return
+        self._status_queue.put("Loading Pocket TTS model...")
+        try:
+            pocket_tts = importlib.import_module("pocket_tts")
+            tts_model = getattr(pocket_tts, "TTSModel")
+        except (ImportError, OSError) as exc:
+            raise RuntimeError(
+                "Pocket TTS failed to load through the in-process Python API. "
+                "Use the uvx-server backend, especially from the bundled EXE."
+            ) from exc
+
+        try:
+            self._model = tts_model.load_model(language=self.language, quantize=self.quantize)
+        except OSError as exc:
+            raise RuntimeError(
+                "Pocket TTS/Torch DLL initialization failed in the in-process Python API. "
+                "Use the uvx-server backend instead."
+            ) from exc
+        voice = self.default_voice if self.voice_source == "default" else self._custom_voice_path()
+        self._status_queue.put(f"Loading voice: {voice}")
+        self._voice_state = self._model.get_state_for_audio_prompt(voice)
+        self._status_queue.put("Pocket TTS ready.")
+
+    def _custom_voice_path(self) -> str:
+        """Return the custom voice prompt path, converting MP3 input to WAV.
+
+        Raises:
+            ValueError: If no voice path is configured.
+        """
+        if not self.voice_path.strip():
+            raise ValueError("Select a WAV or MP3 file, or change Voice Source to default.")
+        return str(_prepared_audio_prompt_path(self.voice_path, self.language, self._status_queue))
+
+    def _speak(self, text: str, cancel_event: threading.Event) -> None:
+        """Thread target: generate audio for *text* and write it to an output stream.
+
+        Holds ``self._lock`` for the whole load-and-play sequence, so a newer
+        request waits until this one notices its cancel event. Any exception is
+        reported on the status queue instead of being raised.
+        """
+        with self._lock:
+            try:
+                self._load()
+                if cancel_event.is_set():
+                    self._status_queue.put("Stopped previous TTS stream.")
+                    return
+                assert self._model is not None
+                assert self._voice_state is not None
+                sample_rate = int(self._model.sample_rate)
+                self._status_queue.put("Speaking OCR text...")
+                with sd.OutputStream(samplerate=sample_rate, channels=1, dtype="float32") as stream:
+                    for chunk in self._model.generate_audio_stream(self._voice_state, text):
+                        # Cancellation is only checked between generated chunks.
+                        if cancel_event.is_set():
+                            self._status_queue.put("Stopped previous TTS stream.")
+                            return
+                        audio = chunk.detach().cpu().numpy()
+                        # Flatten to a 1-D float32 array before playback.
+                        audio = np.asarray(audio, dtype=np.float32).reshape(-1)
+                        if audio.size:
+                            stream.write(_apply_volume_gain(audio, self.volume_gain))
+                self._status_queue.put("Done.")
+            except Exception as exc:
+                self._status_queue.put(f"TTS error: {exc}")
+
+
+class UvxPocketTTSServer:
+    def __init__(
+        self,
+        voice_path: str | Path,
+        language: str = "english",
+        quantize: bool = False,
+        host: str = "localhost",
+        port: int = 8000,
+        voice_source: str = "default",
+        default_voice: str = "alba",
+        custom_voice_name: str = "",
+        volume_gain: float = 1.0,
+    ) -> None:
+        self.voice_path = str(voice_path)
+        self.language = language
+        self.quantize = quantize
+        self.host = host
+        self.port = port
+        self.voice_source = voice_source
+        self.default_voice = default_voice
+        self.custom_voice_name = custom_voice_name
+        self.volume_gain = _clamp_volume_gain(volume_gain)
+        self._process: subprocess.Popen[str] | None = None
+        self._lock = threading.Lock()
+        self._speak_lock = threading.Lock()
+        self._cancel_lock = threading.Lock()
+        self._cancel_event = threading.Event()
+        self._active_response: requests.Response | None = None
+        self._server_output: collections.deque[str] = collections.deque(maxlen=80)
+        self._status_queue: queue.Queue[str] = queue.Queue()
+
+    @property
+    def status_queue(self) -> queue.Queue[str]:
+        return self._status_queue
+
+    @property
+    def base_url(self) -> str:
+        return f"http://{self.host}:{self.port}/"
+
+    def preload_async(self) -> None:
+        threading.Thread(target=self._ensure_server, daemon=True).start()
+
+    def speak_async(self, text: str) -> None:
+        text = text.strip()
+        if not text:
+            return
+        cancel_event = self._begin_new_stream()
+        threading.Thread(target=self._speak, args=(text, cancel_event), daemon=True).start()
+
+    def test_async(self) -> None:
+        self.speak_async("This is a Pocket TTS test.")
+
+    def close(self) -> None:
+        self.stop()
+        if self._process and self._process.poll() is None:
+            self._process.terminate()
+
+    def stop(self) -> None:
+        with self._cancel_lock:
+            self._cancel_event.set()
+            if self._active_response is not None:
+                self._active_response.close()
+
+    def _begin_new_stream(self) -> threading.Event:
+        with self._cancel_lock:
+            self._cancel_event.set()
+            if self._active_response is not None:
+                self._active_response.close()
+            self._cancel_event = threading.Event()
+            return self._cancel_event
+
+    def _is_healthy(self) -> bool:
+        try:
+            response = requests.get(urljoin(self.base_url, "health"), timeout=2)
+            return response.ok
+        except requests.RequestException:
+            return False
+
+    def _ensure_server(self) -> None:
+        with self._lock:
+            if self._is_healthy():
+                self._status_queue.put("Pocket TTS server ready.")
+                return
+            if self._process is None or self._process.poll() is not None:
+                uvx = _find_uvx()
+                command = [
+                    str(uvx),
+                    "pocket-tts",
+                    "serve",
+                    "--host",
+                    self.host,
+                    "--port",
+                    str(self.port),
+                    "--language",
+                    self.language,
+                ]
+                if self.quantize:
+                    command.append("--quantize")
+                self._server_output.clear()
+                self._status_queue.put(f"Starting Pocket TTS server with {uvx}...")
+                self._process = subprocess.Popen(
+                    command,
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.STDOUT,
+                    text=True,
+                    encoding="utf-8",
+                    errors="replace",
+                    env=_clean_subprocess_env(),
+                    cwd=str(Path.home()),
+                    creationflags=_subprocess_creationflags(),
+                )
+                threading.Thread(target=self._read_server_output, daemon=True).start()
+            deadline = time.monotonic() + 900
+            while time.monotonic() < deadline:
+                if self._is_healthy():
+                    self._status_queue.put("Pocket TTS server ready.")
+                    return
+                if self._process and self._process.poll() is not None:
+                    output = self._server_output_tail()
+                    detail = f"\n{output}" if output else " No server output was captured."
+                    raise RuntimeError(f"Pocket TTS server exited with code {self._process.returncode}.{detail}")
+                time.sleep(1)
+            raise TimeoutError("Pocket TTS server did not become ready before timeout.")
+
+    def _read_server_output(self) -> None:
+        process = self._process
+        if process is None or process.stdout is None:
+            return
+        try:
+            for line in process.stdout:
+                line = line.strip()
+                if line:
+                    self._server_output.append(line)
+        except Exception as exc:
+            self._server_output.append(f"Failed to read server output: {exc}")
+
+    def _server_output_tail(self) -> str:
+        if not self._server_output:
+            return ""
+        return "\n".join(list(self._server_output)[-12:])
+
+    def _speak(self, text: str, cancel_event: threading.Event) -> None:
+        with self._speak_lock:
+            if cancel_event.is_set():
+                self._status_queue.put("Stopped previous TTS stream.")
+                return
+            try:
+                self._ensure_server()
+                if cancel_event.is_set():
+                    self._status_queue.put("Stopped previous TTS stream.")
+                    return
+                self._status_queue.put("Requesting Pocket TTS audio...")
+                if self.voice_source == "default":
+                    response = requests.post(
+                        urljoin(self.base_url, "tts"),
+                        data={"text": text, "voice_url": self.default_voice},
+                        stream=True,
+                        timeout=900,
+                    )
+                else:
+                    voice_url = self._custom_voice_url()
+                    response = requests.post(
+                        urljoin(self.base_url, "tts"),
+                        data={"text": text, "voice_url": voice_url},
+                        stream=True,
+                        timeout=900,
+                    )
+                with self._cancel_lock:
+                    self._active_response = response
+                response.raise_for_status()
+                self._play_streaming_wav(response, cancel_event)
+                if not cancel_event.is_set():
+                    self._status_queue.put("Done.")
+            except requests.RequestException as exc:
+                if cancel_event.is_set():
+                    self._status_queue.put("Stopped previous TTS stream.")
+                else:
+                    self._status_queue.put(f"TTS error: {exc}")
+            except Exception as exc:
+                self._status_queue.put(f"TTS error: {exc}")
+            finally:
+                with self._cancel_lock:
+                    self._active_response = None
+
+    def _custom_voice_path(self) -> str:
+        if not self.voice_path.strip():
+            raise ValueError("Select a WAV or MP3 file, or change Voice Source to default.")
+        return self.voice_path
+
+    def _custom_voice_url(self) -> str:
+        voice_state = _cached_voice_state_path(
+            self._custom_voice_path(),
+            self.language,
+            self._status_queue,
+            self.custom_voice_name,
+        )
+        return _voice_state_server.url_for(voice_state)
+
+    def _play_streaming_wav(self, response: requests.Response, cancel_event: threading.Event) -> None:
+        buffer = bytearray()
+        stream: sd.OutputStream | None = None
+        sample_width = 0
+        channels = 0
+        try:
+            for chunk in response.iter_content(chunk_size=16384):
+                if cancel_event.is_set():
+                    response.close()
+                    self._status_queue.put("Stopped previous TTS stream.")
+                    return
+                if not chunk:
+                    continue
+                buffer.extend(chunk)
+                if stream is None:
+                    header_end = _find_wav_data_offset(buffer)
+                    if header_end is None:
+                        continue
+                    channels, sample_rate, sample_width = _read_wav_format(buffer)
+                    stream = sd.OutputStream(samplerate=sample_rate, channels=channels, dtype="float32")
+                    stream.start()
+                    del buffer[:header_end]
+                    self._status_queue.put("Streaming Pocket TTS audio...")
+                frame_size = sample_width * channels
+                usable = len(buffer) - (len(buffer) % frame_size)
+                if usable <= 0:
+                    continue
+                pcm = bytes(buffer[:usable])
+                del buffer[:usable]
+                audio = _pcm_to_float32(pcm, sample_width, channels)
+                if audio.size:
+                    stream.write(_apply_volume_gain(audio, self.volume_gain))
+        finally:
+            if stream is not None:
+                stream.stop()
+                stream.close()
+
+
+class _QuietStaticFileHandler(http.server.SimpleHTTPRequestHandler):
+    def log_message(self, _format: str, *args: object) -> None:
+        return
+
+
+class _VoiceStateServer:
+    def __init__(self) -> None:
+        self._lock = threading.Lock()
+        self._server: http.server.ThreadingHTTPServer | None = None
+        self._thread: threading.Thread | None = None
+
+    def url_for(self, path: Path) -> str:
+        with self._lock:
+            VOICE_CACHE_DIR.mkdir(parents=True, exist_ok=True)
+            if self._server is None:
+                handler = functools.partial(_QuietStaticFileHandler, directory=str(VOICE_CACHE_DIR))
+                self._server = http.server.ThreadingHTTPServer(("127.0.0.1", 0), handler)
+                self._thread = threading.Thread(target=self._server.serve_forever, daemon=True)
+                self._thread.start()
+            port = self._server.server_address[1]
+        return f"http://127.0.0.1:{port}/{quote(path.name)}"
+
+
+# Module-wide instance; constructing it only creates a lock, the HTTP server
+# itself starts on the first url_for() call.
+_voice_state_server = _VoiceStateServer()
+
+
+def _cached_voice_state_path(
+    source_path: str,
+    language: str,
+    status_queue: queue.Queue[str],
+    voice_name: str = "",
+) -> Path:
+    source = Path(source_path)
+    if not source.exists():
+        raise FileNotFoundError(f"Voice file not found: {source}")
+    VOICE_CACHE_DIR.mkdir(parents=True, exist_ok=True)
+    stat = source.stat()
+    digest = hashlib.sha256(
+        f"{source.resolve()}|{stat.st_mtime_ns}|{stat.st_size}|{language}".encode("utf-8")
+    ).hexdigest()[:24]
+    prefix = safe_voice_slug(voice_name) if voice_name.strip() else source.stem
+    target = VOICE_CACHE_DIR / f"{safe_voice_slug(prefix)}-{digest}.safetensors"
+    if source.suffix.casefold() == ".safetensors":
+        if not target.exists():
+            shutil.copy2(source, target)
+        status_queue.put("Using cached custom voice state.")
+        return target
+    if target.exists():
+        status_queue.put("Using cached custom voice state.")
+        return target
+
+    prompt_source = _prepared_audio_prompt_path(source, language, status_queue, digest)
+    status_queue.put("Exporting custom voice cache; first run can take a while.")
+    command = [
+        str(_find_uvx()),
+        "pocket-tts",
+        "export-voice",
+        str(prompt_source),
+        str(target),
+        "--language",
+        language,
+        "--quiet",
+    ]
+    subprocess.run(command, check=True, env=_clean_subprocess_env(), creationflags=_subprocess_creationflags())
+    status_queue.put("Custom voice cache ready.")
+    return target
+
+
+def _prepared_audio_prompt_path(
+    source_path: str | Path,
+    language: str,
+    status_queue: queue.Queue[str],
+    digest: str | None = None,
+) -> Path:
+    source = Path(source_path)
+    if source.suffix.casefold() != ".mp3":
+        return source
+    VOICE_CACHE_DIR.mkdir(parents=True, exist_ok=True)
+    if digest is None:
+        stat = source.stat()
+        digest = hashlib.sha256(
+            f"{source.resolve()}|{stat.st_mtime_ns}|{stat.st_size}|{language}".encode("utf-8")
+        ).hexdigest()[:24]
+    target = VOICE_CACHE_DIR / f"{safe_voice_slug(source.stem)}-{digest}.wav"
+    if target.exists():
+        status_queue.put("Using cached WAV conversion for MP3 voice.")
+        return target
+    status_queue.put("Converting MP3 voice reference to WAV...")
+    _convert_mp3_to_wav(source, target)
+    status_queue.put("MP3 voice conversion ready.")
+    return target
+
+
+def _convert_mp3_to_wav(source: Path, target: Path) -> None:
+    try:
+        import imageio_ffmpeg
+    except ImportError as exc:
+        raise RuntimeError("MP3 custom voices require imageio-ffmpeg. Reinstall Seshat TTS dependencies.") from exc
+
+    command = [
+        imageio_ffmpeg.get_ffmpeg_exe(),
+        "-hide_banner",
+        "-loglevel",
+        "error",
+        "-y",
+        "-i",
+        str(source),
+        "-vn",
+        "-ac",
+        "1",
+        "-ar",
+        "24000",
+        "-sample_fmt",
+        "s16",
+        str(target),
+    ]
+    subprocess.run(command, check=True, env=_clean_subprocess_env(), creationflags=_subprocess_creationflags())
+
+
+def _find_wav_data_offset(data: bytearray) -> int | None:
+    marker = data.find(b"data")
+    if marker < 0 or len(data) < marker + 8:
+        return None
+    return marker + 8
+
+
+def _read_wav_format(data: bytearray) -> tuple[int, int, int]:
+    if len(data) < 36 or data[:4] != b"RIFF" or data[8:12] != b"WAVE":
+        raise ValueError("Response is not a WAV stream.")
+    fmt = data.find(b"fmt ")
+    if fmt < 0 or len(data) < fmt + 24:
+        raise ValueError("WAV stream is missing fmt chunk.")
+    channels = int.from_bytes(data[fmt + 10 : fmt + 12], "little")
+    sample_rate = int.from_bytes(data[fmt + 12 : fmt + 16], "little")
+    bits_per_sample = int.from_bytes(data[fmt + 22 : fmt + 24], "little")
+    return channels, sample_rate, bits_per_sample // 8
+
+
+def _pcm_to_float32(pcm: bytes, sample_width: int, channels: int) -> np.ndarray:
+    if sample_width == 2:
+        audio = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32768.0
+    elif sample_width == 4:
+        audio = np.frombuffer(pcm, dtype=np.int32).astype(np.float32) / 2147483648.0
+    else:
+        raise ValueError(f"Unsupported WAV sample width: {sample_width}")
+    if channels > 1:
+        return audio.reshape(-1, channels)
+    return audio.reshape(-1, 1)
+
+
+def _clamp_volume_gain(value: float) -> float:
+    return max(0.0, min(float(value), 3.0))
+
+
+def _apply_volume_gain(audio: np.ndarray, volume_gain: float) -> np.ndarray:
+    gain = _clamp_volume_gain(volume_gain)
+    if gain == 1.0:
+        return audio
+    return np.clip(audio * gain, -1.0, 1.0).astype(np.float32, copy=False)
+
+
+def _find_uvx() -> Path:
+    bundled = resource_path("tools/uvx.exe")
+    if bundled.exists():
+        return bundled
+    found = shutil.which("uvx")
+    if found:
+        return Path(found)
+    candidates = [
+        Path.home() / ".local" / "bin" / "uvx.exe",
+        Path.home() / ".cargo" / "bin" / "uvx.exe",
+    ]
+    for candidate in candidates:
+        if candidate.exists():
+            return candidate
+    raise FileNotFoundError("uvx.exe was not found on PATH. Install uv or add uvx.exe to PATH.")
+
+
+def _clean_subprocess_env() -> dict[str, str]:
+    env = os.environ.copy()
+    for key in list(env):
+        if key.startswith("_PYI") or key.startswith("PYINSTALLER"):
+            env.pop(key, None)
+    env.pop("PYTHONHOME", None)
+    env.pop("PYTHONPATH", None)
+
+    user_bin = Path.home() / ".local" / "bin"
+    if user_bin.exists():
+        env["PATH"] = str(user_bin) + os.pathsep + env.get("PATH", "")
+    return env
+
+
+def _subprocess_creationflags() -> int:
+    if os.name != "nt":
+        return 0
+    return int(getattr(subprocess, "CREATE_NO_WINDOW", 0))
@@ -0,0 +1,55 @@
+from __future__ import annotations
+
+import json
+import re
+from dataclasses import asdict, dataclass
+from pathlib import Path
+from typing import Any
+
+from .config import APP_DIR
+
+
+# Default JSON file used by the load/save/upsert helpers in this module.
+VOICE_PROFILES_PATH = APP_DIR / "voice_profiles.json"
+
+
+@dataclass(slots=True)
+class VoiceProfile:
+    """A named custom voice: a display name paired with a voice file path."""
+
+    # Profile name; upsert_voice_profile treats it as the unique key.
+    name: str
+    # Path to the voice file, stored as a plain string.
+    path: str
+
+
+def safe_voice_slug(name: str) -> str:
+    slug = re.sub(r"[^a-zA-Z0-9._-]+", "-", name.strip()).strip("-._")
+    return slug or "custom-voice"
+
+
+def load_voice_profiles(path: Path = VOICE_PROFILES_PATH) -> list[VoiceProfile]:
+    if not path.exists():
+        return []
+    data = json.loads(path.read_text(encoding="utf-8"))
+    profiles: list[VoiceProfile] = []
+    for item in data if isinstance(data, list) else []:
+        if not isinstance(item, dict):
+            continue
+        name = str(item.get("name", "")).strip()
+        voice_path = str(item.get("path", "")).strip()
+        if name and voice_path:
+            profiles.append(VoiceProfile(name=name, path=voice_path))
+    return profiles
+
+
+def save_voice_profiles(profiles: list[VoiceProfile], path: Path = VOICE_PROFILES_PATH) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(json.dumps([asdict(profile) for profile in profiles], indent=2), encoding="utf-8")
+
+
+def upsert_voice_profile(profile: VoiceProfile, path: Path = VOICE_PROFILES_PATH) -> list[VoiceProfile]:
+    profiles = [item for item in load_voice_profiles(path) if item.name != profile.name]
+    profiles.append(profile)
+    profiles.sort(key=lambda item: item.name.casefold())
+    save_voice_profiles(profiles, path)
+    return profiles
+
+
+def voice_profile_by_name(name: str, profiles: list[VoiceProfile]) -> VoiceProfile | None:
+    return next((profile for profile in profiles if profile.name == name), None)
@@ -0,0 +1,70 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+import win32gui
+
+
+@dataclass(frozen=True, slots=True)
+class WindowInfo:
+    hwnd: int
+    title: str
+    left: int
+    top: int
+    right: int
+    bottom: int
+
+    @property
+    def width(self) -> int:
+        return self.right - self.left
+
+    @property
+    def height(self) -> int:
+        return self.bottom - self.top
+
+    @property
+    def label(self) -> str:
+        return f"{self.title} [{self.width}x{self.height} at {self.left},{self.top}]"
+
+
+def _is_candidate(hwnd: int) -> bool:
+    if not win32gui.IsWindowVisible(hwnd):
+        return False
+    title = win32gui.GetWindowText(hwnd).strip()
+    if not title:
+        return False
+    left, top, right, bottom = win32gui.GetWindowRect(hwnd)
+    return (right - left) > 50 and (bottom - top) > 50
+
+
+def list_visible_windows() -> list[WindowInfo]:
+    windows: list[WindowInfo] = []
+
+    def callback(hwnd: int, _extra: object) -> None:
+        if _is_candidate(hwnd):
+            left, top, right, bottom = win32gui.GetWindowRect(hwnd)
+            windows.append(
+                WindowInfo(
+                    hwnd=hwnd,
+                    title=win32gui.GetWindowText(hwnd).strip(),
+                    left=left,
+                    top=top,
+                    right=right,
+                    bottom=bottom,
+                )
+            )
+
+    win32gui.EnumWindows(callback, None)
+    windows.sort(key=lambda item: item.title.casefold())
+    return windows
+
+
+def find_window_by_title(title: str) -> WindowInfo | None:
+    title = title.strip()
+    if not title:
+        return None
+    for window in list_visible_windows():
+        if window.title == title:
+            return window
+    needle = title.casefold()
+    return next((window for window in list_visible_windows() if needle in window.title.casefold()), None)
@@ -0,0 +1,80 @@
+from pathlib import Path
+import json
+
+from seshat_tts.config import AppConfig, Rect, load_config, save_config
+
+
+def test_config_roundtrip(tmp_path: Path) -> None:
+    path = tmp_path / "config.json"
+    config = AppConfig(
+        monitor_index=2,
+        hotkey="ctrl+shift+d",
+        capture_region_hotkey="ctrl+shift+r",
+        stop_hotkey="ctrl+shift+s",
+        dialogue_rect=Rect(left=1, top=2, width=3, height=4),
+        tesseract_cmd="C:/Tesseract/tesseract.exe",
+        voice_source="custom-wav",
+        default_voice="alba",
+        voice_path="voice.mp3",
+        language="english",
+        quantize_tts=True,
+        volume_gain=1.75,
+        last_text="hello",
+    )
+
+    save_config(config, path)
+
+    assert load_config(path) == config
+
+
+def test_load_config_removes_old_region_metadata_from_last_text(tmp_path: Path) -> None:
+    path = tmp_path / "config.json"
+    path.write_text(
+        json.dumps(
+            {
+                "last_text": "Capture region: 85,51 628x84\nText region: 85,44 633x77\n\nA line to read."
+            }
+        ),
+        encoding="utf-8",
+    )
+
+    assert load_config(path).last_text == "A line to read."
+
+
+def test_load_config_reads_llm_settings(tmp_path: Path) -> None:
+    path = tmp_path / "config.json"
+    path.write_text(
+        json.dumps(
+            {
+                "llm_enabled": True,
+                "llm_base_url": "http://127.0.0.1:11434/v1",
+                "llm_api_key": "local",
+                "llm_model": "unsloth-local",
+                "llm_timeout": 1.5,
+                "llm_max_tokens": 64,
+                "llm_disable_thinking": False,
+                "llm_image_extraction": True,
+                "llm_system_prompt": "clean this",
+            }
+        ),
+        encoding="utf-8",
+    )
+
+    config = load_config(path)
+
+    assert config.llm_enabled is True
+    assert config.llm_base_url == "http://127.0.0.1:11434/v1"
+    assert config.llm_api_key == "local"
+    assert config.llm_model == "unsloth-local"
+    assert config.llm_timeout == 1.5
+    assert config.llm_max_tokens == 64
+    assert config.llm_disable_thinking is False
+    assert config.llm_image_extraction is True
+    assert config.llm_system_prompt == "clean this"
+
+
+def test_load_config_forces_english_language(tmp_path: Path) -> None:
+    path = tmp_path / "config.json"
+    path.write_text(json.dumps({"language": "french"}), encoding="utf-8")
+
+    assert load_config(path).language == "english"
@@ -0,0 +1,118 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+from PIL import Image
+
+from seshat_tts.llm import process_image_with_llm, process_text_with_llm
+
+
+@dataclass
+class _Message:
+    """Stub chat message; holds the text the fake completion hands back."""
+
+    content: str
+
+
+@dataclass
+class _Choice:
+    """Stub completion choice wrapping a single ``_Message``."""
+
+    message: _Message
+
+
+@dataclass
+class _Response:
+    """Stub completion response: the list of choices returned by ``_Completions.create``."""
+
+    choices: list[_Choice]
+
+
+class _Completions:
+    """Fake ``chat.completions`` endpoint that records the arguments it was called with."""
+
+    def __init__(self) -> None:
+        # Remains None until create() runs, letting tests assert that a call happened.
+        self.kwargs: dict[str, object] | None = None
+
+    def create(self, **kwargs: object) -> _Response:
+        """Store the keyword arguments and return a canned single-choice response."""
+        self.kwargs = kwargs
+        canned_message = _Message("Cleaned text.")
+        canned_choice = _Choice(canned_message)
+        return _Response([canned_choice])
+
+
+class _Client:
+    """Minimal fake client exposing ``chat.completions`` backed by a ``_Completions`` stub."""
+
+    def __init__(self) -> None:
+        # A throwaway "Chat" type is built per client, so each client owns its recorder.
+        recorder = _Completions()
+        chat_type = type("Chat", (), {"completions": recorder})
+        self.chat = chat_type()
+
+
+def test_llm_disabled_returns_original_text() -> None:
+    """With ``enabled=False`` the input should come back with surrounding whitespace removed."""
+    untouched = process_text_with_llm(
+        " OCR text ",
+        enabled=False,
+        base_url="http://127.0.0.1:8000/v1",
+        api_key="local",
+        model="unsloth",
+        system_prompt="clean",
+    )
+
+    assert untouched == "OCR text"
+
+
+def test_llm_enabled_uses_openai_compatible_chat_client() -> None:
+    """An enabled call should go through the injected client and forward the expected options."""
+    fake_client = _Client()
+    expected_extra_body = {
+        "chat_template_kwargs": {"enable_thinking": False},
+        "enable_thinking": False,
+        "reasoning_effort": "none",
+    }
+
+    cleaned = process_text_with_llm(
+        "OCR text",
+        enabled=True,
+        base_url="http://127.0.0.1:8000/v1",
+        api_key="local",
+        model="unsloth-model",
+        system_prompt="clean",
+        timeout=1,
+        max_tokens=32,
+        client=fake_client,
+    )
+
+    # The stub records whatever keyword arguments reached ``create``.
+    sent = fake_client.chat.completions.kwargs
+    assert cleaned == "Cleaned text."
+    assert sent is not None
+    assert sent["model"] == "unsloth-model"
+    assert sent["temperature"] == 0
+    assert sent["extra_body"] == expected_extra_body
+
+
+def test_llm_can_send_without_disable_thinking_metadata() -> None:
+    """With ``disable_thinking=False`` no ``extra_body`` should be passed to the client."""
+    fake_client = _Client()
+
+    process_text_with_llm(
+        "OCR text",
+        enabled=True,
+        base_url="http://127.0.0.1:8000/v1",
+        api_key="local",
+        model="unsloth-model",
+        system_prompt="clean",
+        disable_thinking=False,
+        client=fake_client,
+    )
+
+    sent = fake_client.chat.completions.kwargs
+    assert sent is not None
+    assert "extra_body" not in sent
+
+
+def test_llm_can_extract_text_from_image_region() -> None:
+    """The image should be sent as a base64 PNG data URL after a text part in the user message."""
+    fake_client = _Client()
+    region = Image.new("RGB", (16, 8), "black")
+
+    extracted = process_image_with_llm(
+        region,
+        base_url="http://127.0.0.1:8000/v1",
+        api_key="local",
+        model="vision-model",
+        timeout=1,
+        max_tokens=64,
+        client=fake_client,
+    )
+
+    sent = fake_client.chat.completions.kwargs
+    assert extracted == "Cleaned text."
+    assert sent is not None
+    assert sent["model"] == "vision-model"
+
+    # The second message is expected to carry the multi-part user content.
+    text_part, image_part = sent["messages"][1]["content"][:2]
+    assert text_part["type"] == "text"
+    assert image_part["type"] == "image_url"
+    assert image_part["image_url"]["url"].startswith("data:image/png;base64,")
@@ -0,0 +1,22 @@
+from seshat_tts.ocr import extract_text_from_lines
+
+
+def test_selected_text_does_not_skip_first_line() -> None:
+    """Both lines, the first included, should be joined into one space-separated string."""
+    ocr_lines = [
+        "A large group of humanoids came from the foothills",
+        "and headed north not long ago.",
+    ]
+    expected = "A large group of humanoids came from the foothills and headed north not long ago."
+
+    assert extract_text_from_lines(ocr_lines) == expected
+
+
+def test_selected_text_includes_choice_marker_text_when_inside_region() -> None:
+    """A line starting with the ``|.`` marker should be kept verbatim in the joined text."""
+    joined = extract_text_from_lines(["Line to read.", "|. Continue"])
+
+    assert joined == "Line to read. |. Continue"
+
+
+def test_selected_text_includes_pipe_marker_without_dot_when_inside_region() -> None:
+    """A line starting with a bare ``|`` marker should be kept verbatim in the joined text."""
+    joined = extract_text_from_lines(["Line to read.", "| Continue"])
+
+    assert joined == "Line to read. | Continue"
@@ -0,0 +1,34 @@
+from pathlib import Path
+import queue
+
+from seshat_tts import tts
+
+
+def test_prepared_audio_prompt_leaves_wav_unchanged(tmp_path: Path) -> None:
+    """A ``.wav`` prompt path should be returned as-is."""
+    wav_file = tmp_path / "voice.wav"
+    wav_file.write_bytes(b"wav")
+
+    prepared = tts._prepared_audio_prompt_path(wav_file, "english", queue.Queue())
+
+    assert prepared == wav_file
+
+
+def test_prepared_audio_prompt_converts_mp3_once(tmp_path: Path, monkeypatch) -> None:
+    """Preparing the same ``.mp3`` prompt twice should invoke the converter only once."""
+    mp3_file = tmp_path / "voice.mp3"
+    mp3_file.write_bytes(b"mp3")
+    conversions: list[tuple[Path, Path]] = []
+
+    def record_conversion(input_path: Path, output_path: Path) -> None:
+        # Stand-in converter: log the call and create the output file it was asked for.
+        conversions.append((input_path, output_path))
+        output_path.write_bytes(b"wav")
+
+    # Point the module at a per-test cache directory and swap in the recording converter.
+    monkeypatch.setattr(tts, "VOICE_CACHE_DIR", tmp_path / "cache")
+    monkeypatch.setattr(tts, "_convert_mp3_to_wav", record_conversion)
+
+    initial = tts._prepared_audio_prompt_path(mp3_file, "english", queue.Queue())
+    repeat = tts._prepared_audio_prompt_path(mp3_file, "english", queue.Queue())
+
+    assert initial == repeat
+    assert initial.suffix == ".wav"
+    assert initial.exists()
+    assert conversions == [(mp3_file, initial)]