commit 75fc1afa53c922db8cc2006300a2e56c43160064 Author: cbartos Date: Fri May 22 05:54:01 2026 -0400 seshat-tts diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..a8092d9 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,13 @@ +root = true + +[*] +charset = utf-8 +end_of_line = crlf +insert_final_newline = true +trim_trailing_whitespace = true + +[*.{md,yml,yaml,toml,py}] +end_of_line = lf + +[*.ps1] +end_of_line = crlf diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml new file mode 100644 index 0000000..de4d556 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -0,0 +1,64 @@ +name: Bug report +description: Report a reproducible Seshat TTS problem +title: "[Bug]: " +labels: ["bug", "needs-triage"] +body: + - type: textarea + id: summary + attributes: + label: Summary + description: What happened? + validations: + required: true + - type: textarea + id: steps + attributes: + label: Steps to reproduce + description: List the exact steps. + placeholder: | + 1. Open app + 2. Select window + 3. Press hotkey + validations: + required: true + - type: textarea + id: expected + attributes: + label: Expected behavior + validations: + required: true + - type: textarea + id: actual + attributes: + label: Actual behavior + validations: + required: true + - type: input + id: version + attributes: + label: App version or commit + - type: input + id: windows + attributes: + label: Windows version + - type: dropdown + id: capture-mode + attributes: + label: Capture mode + options: + - monitor + - window + - unknown + - type: dropdown + id: voice-mode + attributes: + label: Voice mode + options: + - default + - custom-wav + - unknown + - type: textarea + id: logs + attributes: + label: Error text or logs + description: Remove secrets before pasting. 
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000..6e2937d --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,8 @@ +blank_issues_enabled: false +contact_links: + - name: Support + url: mailto:support@scriptriva.com + about: Contact Scriptriva Inc. for support inquiries. + - name: Security vulnerability + url: mailto:support@scriptriva.com + about: Please report security issues privately. diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml new file mode 100644 index 0000000..cdce2a1 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -0,0 +1,28 @@ +name: Feature request +description: Suggest a focused improvement +title: "[Feature]: " +labels: ["enhancement", "needs-triage"] +body: + - type: textarea + id: problem + attributes: + label: Problem + description: What user problem does this solve? + validations: + required: true + - type: textarea + id: proposal + attributes: + label: Proposed solution + description: What should change? + validations: + required: true + - type: textarea + id: alternatives + attributes: + label: Alternatives considered + - type: textarea + id: scope + attributes: + label: Scope and risks + description: What files or behavior might be affected? diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..2f16dc0 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,16 @@ +## Summary + +Describe what changed and why. 
+ +## Testing + +- [ ] `python -m pytest -q` +- [ ] GUI smoke tested, if UI behavior changed +- [ ] Build tested, if packaging changed + +## Checklist + +- [ ] No secrets, voice samples, local configs, or generated binaries committed +- [ ] Documentation updated if behavior changed +- [ ] Third-party notices preserved +- [ ] Change is scoped and maintainable diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..a5847a2 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,11 @@ +version: 2 +updates: + - package-ecosystem: "pip" + directory: "/" + schedule: + interval: "weekly" + open-pull-requests-limit: 5 + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "weekly" diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..189d514 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,36 @@ +name: CI + +on: + pull_request: + push: + branches: ["main"] + +permissions: + contents: read + +jobs: + tests: + name: Tests + runs-on: windows-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.10", "3.13"] + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: pip + - name: Install + run: | + python -m pip install --upgrade pip + python -m pip install -e ".[test]" + - name: Compile + run: python -m compileall src tests scripts\pyinstaller_entry.py + - name: Test + env: + PYTHONPATH: src + run: python -m pytest -q diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9da0010 --- /dev/null +++ b/.gitignore @@ -0,0 +1,13 @@ +build/* +build/launcher/* +dist/* +scripts/build_output.txt +*.egg-info/ +__pycache__/* +.pytest_cache/* +.venv/ +*.pyc +tts_output.wav +*.log +src\seshat_tts\__pycache__/* +api_key.txt diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..2243cac --- 
/dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,23 @@ +# Code of Conduct + +Seshat TTS follows the [Contributor Covenant Code of Conduct](https://www.contributor-covenant.org/version/2/1/code_of_conduct/). + +## Expected Behavior + +- Be respectful, direct, and constructive. +- Assume good intent while still holding technical claims to evidence. +- Keep discussion focused on the project and its users. +- Report security or abuse issues through the security process instead of public threads. + +## Unacceptable Behavior + +- Harassment, threats, discriminatory language, or targeted personal attacks. +- Publishing private information or credentials. +- Using the project to support abuse, fraud, malware, unauthorized surveillance, or unauthorized voice cloning. +- Repeatedly derailing technical discussion after maintainers ask to refocus. + +## Enforcement + +Scriptriva Inc. maintainers may moderate, edit, hide, lock, or remove issues, comments, pull requests, and releases that violate this code. Serious or repeated violations may result in a ban from project spaces. + +For conduct concerns, contact support@scriptriva.com. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..8b5b602 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,39 @@ +# Contributing + +Thank you for helping improve Seshat TTS. This project is maintained by Scriptriva Inc. and welcomes focused community contributions. + +## Development Setup + +```powershell +python -m venv .venv +.\.venv\Scripts\Activate.ps1 +python -m pip install -e ".[test]" +$env:PYTHONPATH='src' +python -m pytest -q +``` + +## Contribution Areas + +- OCR accuracy and preprocessing. +- Window capture reliability. +- TTS stream cancellation and playback. +- Local OpenAI-compatible LLM cleanup. +- Packaging and documentation. +- Accessibility and usability. + +## Pull Request Expectations + +- Keep changes scoped and explain user-visible behavior. +- Add or update tests for behavior changes. 
+- Do not commit secrets, voice samples, generated voice caches, build outputs, or local config files. +- Preserve third-party notices and license files. +- Follow existing code style and avoid unrelated refactors. +- Run `python -m pytest -q` before opening a pull request. + +## Licensing + +By contributing, you agree that your contribution may be used under the project license and under any future license chosen by Scriptriva Inc. for this project. You also confirm that you have the right to submit the contribution. + +## Security + +Do not open public issues for vulnerabilities. Follow [SECURITY.md](SECURITY.md). diff --git a/GOVERNANCE.md b/GOVERNANCE.md new file mode 100644 index 0000000..fc115cb --- /dev/null +++ b/GOVERNANCE.md @@ -0,0 +1,30 @@ +# Governance + +Seshat TTS is maintained by Scriptriva Inc. + +## Maintainer Authority + +Scriptriva Inc. maintainers decide: + +- project roadmap and priorities +- releases and distribution +- licensing and branding +- security response +- pull request acceptance +- issue moderation + +## Community Contributions + +Community contributions are welcome when they align with the project goals and license. Maintainers may request changes, defer work, close inactive issues, or reject changes that increase maintenance risk. + +## Decision Principles + +- User safety and privacy come first. +- Runtime reliability matters more than novelty. +- Captured text, voice files, API keys, and generated caches are sensitive user data. +- Third-party licenses and notices must remain clear. +- UI changes should keep the app understandable for non-developer users. + +## Maintainer Contact + +For support, licensing, governance, or security routing: support@scriptriva.com diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..a64456d --- /dev/null +++ b/LICENSE @@ -0,0 +1,286 @@ +Scriptriva Public Source License 1.0 + +Copyright (c) 2026 Scriptriva Inc. 
+ +This license may be reused for other projects by replacing the copyright owner, +year, project metadata, support contact, and any project-specific notices. The +terms below use universal language so the license can govern any software, +documentation, assets, build scripts, or related materials distributed under it. + +This is a public source license. It allows commercial use subject to the terms +below. It is not the MIT License, and it is not an Open Source Initiative +approved open source license because it contains attribution, trademark, +distribution, safety, and same-functionality product restrictions. + +1. Definitions + +"Licensor" means the person or entity identified in the copyright notice for the +Licensed Work. + +"Licensed Work" means the source code, object code, documentation, graphical +interface, build scripts, first-party assets, examples, configuration, and other +materials distributed by Licensor under this license, except for third-party +materials governed by separate licenses or notices. + +"You" means any individual, company, organization, or other legal entity that +accesses, uses, copies, modifies, distributes, links to, embeds, hosts, or +otherwise interacts with the Licensed Work. + +"Source Form" means the preferred form of the Licensed Work for reading, +maintaining, or making modifications. + +"Object Form" means any compiled, packaged, bundled, generated, minified, +containerized, hosted, or machine-executable form of the Licensed Work. + +"Modification" means any change, translation, adaptation, refactor, port, +extension, integration, plugin, compilation, packaging, or derivative work based +on the Licensed Work. + +"Public Project" means any repository, package, release, website, distribution, +service, application, marketplace listing, publication, or other disclosure made +available outside your own household or internal organization. 
+ +"Commercial Use" means any use intended for, connected to, or resulting in +direct or indirect monetary value, including selling, licensing, hosting, +offering paid support, bundling with a paid product, including in a commercial +service, using in revenue-generating operations, using to provide professional +services, using in a product trial or freemium funnel, or using to gain a +commercial advantage. + +"Same-Functionality Product" means any software, service, product, script, +workflow, plugin, system, model, dataset, or hosted offering that reproduces, +substitutes for, markets against, or provides substantially similar core +functionality to the Licensed Work. The analysis depends on the purpose and +functionality of the original project, not only on implementation details. + +"First-Party Marks and Assets" means names, trademarks, service marks, logos, +icons, mascots, banners, artwork, screenshots, trade dress, product identities, +and branding owned or supplied by Licensor. + +2. Grant of Rights + +Subject to all terms of this license, Licensor grants you a worldwide, +royalty-free, non-exclusive, non-transferable, non-sublicensable, revocable +license to: + +a. view, read, inspect, and study the Licensed Work; + +b. run the Licensed Work for personal, internal, academic, research, evaluation, +accessibility, interoperability, and Commercial Use; + +c. copy the Licensed Work for backup, archival, testing, internal deployment, +permitted Commercial Use, and permitted distribution; + +d. create Modifications for personal, internal, academic, research, evaluation, +accessibility, interoperability, and Commercial Use; + +e. publish patches, bug reports, interoperability notes, benchmarks, security +findings, documentation, and compatibility layers related to the Licensed Work; + +f. 
distribute the unmodified Licensed Work or Modifications when the distribution +complies with this license, including the attribution, notice, branding, +third-party, and Same-Functionality Product restrictions; and + +g. link to, call, embed, automate, or interoperate with the Licensed Work when +your use complies with this license. + +No rights are granted except those expressly stated in this license. + +3. Attribution and Notice Requirements + +If you copy, modify, distribute, link to, embed, reference, host, or publish any +portion of the Licensed Work in Source Form or Object Form, you must: + +a. preserve all copyright, license, attribution, provenance, and third-party +notice files; + +b. include a clear notice identifying the Licensed Work, the Licensor, the +copyright notice, and the Scriptriva Public Source License 1.0; + +c. identify material Modifications you made; + +d. provide a copy of this license with the relevant copy, package, repository, +release, service, documentation, or distribution; + +e. avoid implying that Licensor sponsors, endorses, maintains, certifies, or is +affiliated with your project unless Licensor gives written permission; and + +f. preserve third-party notices and comply with third-party licenses for +components not owned by Licensor. + +For graphical applications, documentation sites, public demos, app stores, +release pages, hosted services, and packaged distributions, attribution must be +reasonably visible in at least one of: an About window, license page, notices +file, README, package metadata, documentation page, legal page, or equivalent +user-accessible location. + +4. Commercial Use + +Commercial Use is allowed under this license when it complies with all terms of +this license. 
You do not need a separate commercial license merely to use the +Licensed Work commercially, sell services that use the Licensed Work internally, +or include the Licensed Work in a larger commercial system, provided that your +use does not violate Sections 5, 6, 7, or any third-party license. + +5. Same-Functionality Product Restrictions + +You may not, without prior written permission from Licensor: + +a. sell, rent, lease, sublicense, monetize, host, or otherwise charge for access +to the Licensed Work or a Modification as a standalone Same-Functionality +Product; + +b. use the Licensed Work or a Modification to create, operate, improve, train, +benchmark, market, or distribute a Same-Functionality Product; + +c. rebrand the Licensed Work or a Modification as a competing product; + +d. offer the Licensed Work or a Modification as a hosted service, cloud service, +managed service, paid API, subscription product, marketplace product, paid +plugin, paid desktop application, or paid utility whose primary purpose is +substantially similar to the Licensed Work; or + +e. use the Licensed Work to train, fine-tune, evaluate, benchmark, or improve a +model, dataset, agent, or automation intended to compete with or replicate the +core functionality of the Licensed Work. + +This section does not prohibit ordinary Commercial Use permitted by Section 4, +interoperability, independent implementations that do not use the Licensed Work, +or public criticism, benchmarking, security review, or compatibility testing. + +6. Public Projects, Linking, and Modifications + +You may reference, link to, embed, or interoperate with the Licensed Work in a +Public Project, including a commercial Public Project, only if: + +a. the Public Project does not violate the Same-Functionality Product +restrictions in Section 5; + +b. the Public Project includes the attribution required by Section 3; + +c. your use does not make First-Party Marks and Assets appear to be your own; + +d. 
your repository, package, documentation, or notice file clearly identifies +which portions are yours and which portions are derived from, include, or +interface with the Licensed Work; and + +e. any Modification of the Licensed Work remains under this license unless +Licensor gives written permission for another license. + +7. Restrictions + +You may not: + +a. remove, obscure, or alter copyright, license, trademark, attribution, +provenance, or third-party notice files; + +b. use First-Party Marks and Assets in a way that suggests endorsement, +affiliation, certification, or ownership by you; + +c. redistribute First-Party Marks and Assets separately from the Licensed Work +unless Licensor gives written permission; + +d. use the Licensed Work to violate law, privacy rights, publicity rights, voice +rights, copyright, platform terms, or anti-circumvention rules; + +e. use the Licensed Work to clone, synthesize, imitate, capture, extract, or +process a person's voice, likeness, private data, credentials, or protected +content without appropriate rights or permission; + +f. use the Licensed Work in surveillance, credential theft, fraud, malware, +spyware, covert data extraction, or deceptive automation workflows; + +g. bypass, disable, remove, or misrepresent license notices, support notices, +security notices, safety checks, provenance indicators, or attribution flows; + +h. grant sublicenses or downstream rights broader than the rights granted to you +under this license; or + +i. use the Licensed Work in a manner that creates legal obligations for Licensor +or requires Licensor to license First-Party Marks and Assets, patents, trade +secrets, private data, or third-party materials. + +8. 
Contributions + +Unless Licensor signs a separate written agreement with you, any patch, pull +request, issue suggestion, documentation change, bug fix, code sample, asset, +or other contribution you submit to Licensor may be used, modified, relicensed, +sublicensed, distributed, and commercialized by Licensor without restriction or +payment to you. You represent that you have the right to submit the contribution. + +9. Trademarks and Branding + +This license does not grant trademark rights. Licensor retains all rights in +First-Party Marks and Assets. You may use the project name only as necessary for +accurate attribution, compatibility statements, or references to the unmodified +Licensed Work. + +10. Third-Party Components + +Third-party components, dependencies, binaries, models, voices, data files, +fonts, media, and tools are governed by their own licenses and terms. This +license does not change, replace, expand, or restrict rights you may have under +those third-party licenses. If a third-party license gives you broader rights +for that third-party component, those broader rights apply only to that +third-party component and not to the Licensed Work owned by Licensor. + +11. Additional Permissions + +Separate permissions, commercial redistribution permissions, +Same-Functionality Product permissions, branding permissions, trademark +permissions, and other exceptions may be available from Licensor. For this +project, support and licensing inquiries may be sent to support@scriptriva.com. + +12. Termination + +Your rights under this license terminate automatically if you violate any term +of this license. Upon termination, you must stop using, copying, modifying, +distributing, hosting, linking to, or otherwise exploiting the Licensed Work and +destroy or delete all copies under your control, except copies you are legally +required to retain for archival compliance. 
Sections intended to survive, +including attribution, restrictions, third-party notices, trademarks, +disclaimers, limitation of liability, and governing terms, survive termination. + +Licensor may reinstate your rights in writing or by accepting a cure, but +reinstatement is not automatic unless required by applicable law. + +13. No Warranty + +THE LICENSED WORK IS PROVIDED "AS IS" AND "AS AVAILABLE", WITHOUT WARRANTY OF +ANY KIND, EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, INCLUDING WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE, NON-INFRINGEMENT, +SECURITY, ACCURACY, AVAILABILITY, OR ERROR-FREE OPERATION. + +14. Limitation of Liability + +TO THE MAXIMUM EXTENT PERMITTED BY LAW, LICENSOR AND ITS OFFICERS, EMPLOYEES, +CONTRACTORS, CONTRIBUTORS, AND LICENSORS WILL NOT BE LIABLE FOR ANY INDIRECT, +INCIDENTAL, SPECIAL, CONSEQUENTIAL, EXEMPLARY, PUNITIVE, OR BUSINESS- +INTERRUPTION DAMAGES; LOST PROFITS; LOST REVENUE; LOST DATA; LOSS OF GOODWILL; +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; OR CLAIMS ARISING FROM USE, +INABILITY TO USE, MODIFICATION, DISTRIBUTION, HOSTING, OR RELIANCE ON THE +LICENSED WORK, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. + +15. Compliance With Law + +You are responsible for complying with applicable laws, regulations, platform +terms, export controls, privacy rules, accessibility rules, and intellectual +property rights when using the Licensed Work. + +16. License Versions + +This license is version 1.0. Scriptriva Inc. may publish new versions of this +license. A file explicitly licensed under this version remains under this +version unless Licensor states otherwise in writing. + +17. Severability + +If any provision of this license is held unenforceable, the remaining provisions +remain in effect. The unenforceable provision will be interpreted as closely as +possible to its original purpose while remaining enforceable. + +18. 
Entire License + +This license is the complete agreement for the rights granted here. Any waiver, +exception, special permission, or alternate license must be in writing from +Licensor. diff --git a/README.md b/README.md new file mode 100644 index 0000000..457747c --- /dev/null +++ b/README.md @@ -0,0 +1,274 @@ +# Seshat TTS + +![Scriptriva Inc.](resources/banner.jpg) + +

+ Scriptriva Inc. logo +

+ +[![Python 3.10-3.14](https://img.shields.io/badge/python-3.10--3.14-3776ab?logo=python&logoColor=white)](https://www.python.org/) +[![Windows](https://img.shields.io/badge/platform-Windows-0078d4?logo=windows&logoColor=white)](https://www.microsoft.com/windows) +[![License: SPSL 1.0](https://img.shields.io/badge/license-SPSL%201.0-green.svg)](LICENSE) +[![Pocket TTS](https://img.shields.io/badge/voice-Kyutai%20Pocket%20TTS-111827)](https://github.com/kyutai-labs/pocket-tts) +[![Tesseract OCR](https://img.shields.io/badge/OCR-Tesseract-4b5563)](https://github.com/tesseract-ocr/tesseract) + +Seshat TTS is a Windows GUI utility for realtime text-to-speech audio streaming in games or apps. Pick a monitor or window, drag one capture region over the text, press one hotkey, and the selected text is extracted with Tesseract OCR or a local vision LLM, then streamed through Kyutai Pocket TTS. + +Maintained by Scriptriva Inc. + +For support inquiries email: support@scriptriva.com +

+ Anime girl studying +

+ + +## What It Does + +- Captures one selected screen region from a monitor or a chosen window. +- Runs Tesseract OCR on that exact region, or sends the region image directly to a local vision-capable LLM for text extraction. +- Streams the extracted text through Pocket TTS in realtime. +- Lets you use a built-in Pocket TTS voice for speed or upload a custom WAV/MP3 reference voice. +- Optionally routes OCR text through a local OpenAI-compatible LLM endpoint before speech. +- Includes a 0-300% playback volume slider for quiet voices or noisy games. +- Stops any active audio stream when a new read starts, so repeated hotkey presses do not overlap. +- Caches custom voice state as `.safetensors` for faster repeat custom-voice reads when using the `uvx-server` backend. + +

+ Fast yapping mode +

+ +## Requirements + +- Windows 10/11. +- Python 3.10 through 3.14 when running from source or building. +- Tesseract OCR for Windows when running from source or building a portable EXE with bundled OCR. +- `uvx` when running from source, or when building a portable EXE with bundled uvx. +- A working audio output device. + +Install Tesseract: + +```powershell +winget install UB-Mannheim.TesseractOCR +``` + +Install `uvx`: + +```powershell +powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex" +``` + +Install Seshat TTS for development or for the fast launcher: + +```powershell +python -m venv .venv +.\.venv\Scripts\Activate.ps1 +python -m pip install -e .[test] +``` + +## Build Before Use + +For a single-file portable EXE, build with: + +```powershell +.\scripts\build_exe.ps1 +``` + +Portable output: + +```powershell +.\dist\seshat-tts.exe +``` + +That EXE bundles the Seshat GUI/runtime files, app resources, `uvx.exe` if it is available on the build machine, and Tesseract OCR files if Tesseract is installed at `C:\Program Files\Tesseract-OCR`. You can override the OCR bundle source before building: + +```powershell +$env:SESHAT_TESSERACT_DIR='D:\Tools\Tesseract-OCR' +.\scripts\build_exe.ps1 +``` + +For the old one-folder PyInstaller build: + +```powershell +.\scripts\build_exe.ps1 -OneDir +``` + +One-folder output: + +```text +dist\seshat-tts\seshat-tts.exe +``` + +The portable EXE still uses Pocket TTS through `uvx-server`. It does not freeze Torch/Pocket TTS inside the EXE because that path has been unreliable on Windows and can trigger native DLL initialization failures. First Pocket TTS use can still download/cache the Pocket TTS tool and model data under the user's normal cache directories, but no separate Python, Tesseract, or uvx install should be needed when those files were bundled during build. 
+ +For a tiny development launcher, build: + +```powershell +.\scripts\build_launcher_exe.ps1 +``` + +Launcher output: + +```text +dist\launcher\seshat-tts.exe +``` + +This launcher is intentionally small and quick to build. It uses the `.venv` in this project when present, so keep the virtual environment and installed dependencies beside the launcher. + +## Run From Source + +```powershell +seshat-tts +``` + +For the fast launcher EXE, run: + +```powershell +.\dist\launcher\seshat-tts.exe +``` + +The launcher expects dependencies in `.venv` or your active Python environment. It does not bundle Python, Torch, Pocket TTS, or Tesseract. + +## First-Time Setup + +1. Open Seshat TTS. +2. Choose `monitor` or `window` capture mode. +3. Select the monitor or window to watch. +4. Click `Select Region`, then drag over the exact text area to read. +5. Click inside `Read Hotkey` and press the key combo you want. The default is `ctrl+alt+n`. +6. Click inside `Region Hotkey` and press the key combo you want. The default is `ctrl+alt+r`. +7. Click inside `Stop Hotkey` and press the key combo you want. The default is `ctrl+alt+s`. +8. Set `Tesseract` if it was not detected automatically. +9. Choose a voice: + - `default` is fastest and uses a built-in Pocket TTS voice. + - `custom-wav` lets you choose a named WAV, MP3, or cached `.safetensors` reference voice. +10. Adjust `Volume` if the generated voice is too quiet. `100%` is neutral; values above that boost and clip safely. +11. Enable `Local LLM` if you want OCR text cleaned by a local OpenAI-compatible server before TTS. +12. Enable `Use local LLM vision instead of Tesseract OCR` only when your local model endpoint supports image input and you want the LLM to read the selected region directly. +13. Click `Preload TTS` once before playing if you want the first read to be less delayed. +14. Press the read hotkey whenever the selected text should be spoken, or the stop hotkey whenever playback should stop. 
+ +Use borderless/windowed mode for games if exclusive fullscreen capture returns stale or blank frames. + +## Local LLM + +The `Local LLM` panel can use an OpenAI-compatible endpoint in two ways: + +- `Route OCR through local OpenAI-compatible LLM` keeps Tesseract as the text extractor, then asks the local model to clean the parsed text before TTS. +- `Use local LLM vision instead of Tesseract OCR` skips Tesseract and sends the selected region image to the local model as a PNG data URL. This requires a vision-capable OpenAI-compatible model endpoint. + +Typical values: + +```text +Base URL: http://127.0.0.1:8000/v1 +API Key: local key or token +Model: the model name exposed by your local server +``` + +`Load api_key.txt` fills the API key field from a repo-local `api_key.txt` file if present. Treat that file as a secret and do not commit it. Lower timeout and max token values reduce latency; no network or LLM path can be truly zero-latency, but a local endpoint keeps latency as short as the model server allows. + +`Disable thinking` is enabled by default. It sends common OpenAI-compatible metadata for local reasoning models, including `chat_template_kwargs.enable_thinking=false`, so models that support that switch skip reasoning output and return faster. + +## Voice Modes + +`default` voice mode is the fastest. Pick a built-in voice such as `alba`, `marius`, `anna`, `vera`, or `george`. + +`custom-wav` mode accepts `.wav`, `.mp3`, and cached `.safetensors` voice files. MP3 references are converted once into cached WAV files before Pocket TTS processes them. Use `Manage` beside `Custom Voice` to name voices, save them, and select them from the dropdown. + +The first custom-voice run can be slow because Pocket TTS must convert the reference audio into a voice state. 
Seshat TTS caches that state under: + +```text +%USERPROFILE%\.seshat-tts\voices +``` + +After that cache exists, the `uvx-server` backend sends a reusable local `voice_url` instead of uploading and reprocessing the same audio every time. Named custom voices are stored in: + +```text +%USERPROFILE%\.seshat-tts\voice_profiles.json +``` + +Pocket TTS voice cloning may require Hugging Face access: + +1. Request access on [Kyutai's Pocket TTS Hugging Face page](https://huggingface.co/kyutai/pocket-tts). +2. Create a token at [Hugging Face tokens](https://huggingface.co/settings/tokens). +3. Login for `uvx`: + +```powershell +uvx hf auth login --force +``` + +## Build Commands + +Fast launcher build, usually under a minute: + +```powershell +.\scripts\build_launcher_exe.ps1 +``` + +Output: + +```text +dist\launcher\seshat-tts.exe +``` + +Full dependency-bundled PyInstaller build: + +```powershell +.\scripts\build_exe.ps1 +``` + +Output: + +```text +dist\seshat-tts.exe +``` + +Use the fast launcher during development and for local use. Use the portable build when you need to move the app to a machine where Python, Tesseract, and uvx are not installed. + +The `python-api` backend is only shown when running from source or the fast launcher. The bundled PyInstaller EXE only exposes `uvx-server`. + +## License and Reuse + +Seshat TTS is released under the [Scriptriva Public Source License 1.0](LICENSE). + +Copyright (c) 2026 Scriptriva Inc. + +Commercial use is allowed under the license terms. The license preserves attribution, third-party notices, Scriptriva branding rights, safety restrictions, and restrictions on reusing the licensed work to create or distribute a same-functionality product. + +Useful reuse boundaries: + +- `src/seshat_tts/capture.py`: monitor/window capture helpers. +- `src/seshat_tts/ocr.py`: OCR preprocessing and text extraction. +- `src/seshat_tts/tts.py`: Pocket TTS server/API playback adapters and stream cancellation. 
+- `src/seshat_tts/llm.py`: OpenAI-compatible local LLM cleanup step. +- `src/seshat_tts/config.py`: persisted GUI/runtime configuration. +- `src/seshat_tts/region_picker.py`: snipping-tool-style region selection. + +Security and privacy considerations for reuse: + +- Treat OCR text, API keys, custom voice files, and generated voice caches as user data. +- Do not commit `api_key.txt`, voice samples, `.safetensors` voice caches, or local config files. +- Custom voice cloning should be used only with audio you have permission to use. +- The portable EXE may bundle third-party binaries; keep their notices and license terms intact. + + + +## Third-Party Notices + +Seshat TTS uses and/or interfaces with these third-party projects. Each project remains under its own license: + +| Component | Purpose | License | Notes | +| --- | --- | --- | --- | +| [Kyutai Pocket TTS](https://github.com/kyutai-labs/pocket-tts) | Local text-to-speech generation and voice cloning | MIT | The Pocket TTS GitHub repository identifies the project as MIT licensed. Model/voice assets may have separate terms; review the linked Hugging Face pages before redistribution. | +| [Tesseract OCR](https://tesseractocr.org/) | OCR engine used to extract text from selected screen regions | Apache License 2.0 | Tesseract is not MIT licensed. Its project site identifies it as Apache 2.0 licensed. | +| [pytesseract](https://github.com/madmaze/pytesseract) | Python wrapper for Tesseract | Apache License 2.0 | Used to invoke the Tesseract executable from Python. | +| [PyInstaller](https://pyinstaller.org/) | Windows executable packaging | GPLv2-or-later with bootloader exception | Used only for building packaged executables. | +| [OpenAI Python SDK](https://github.com/openai/openai-python) | OpenAI-compatible local LLM client | Apache License 2.0 | Used for optional local LLM cleanup through OpenAI-compatible endpoints. 
| + +Packaged builds include [THIRD_PARTY_NOTICES.md](THIRD_PARTY_NOTICES.md), including a link to the [Pocket TTS MIT license](https://github.com/kyutai-labs/pocket-tts/blob/main/LICENSE). + +## Tests + +```powershell +$env:PYTHONPATH='src' +python -m pytest -q +``` diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..5ad52d2 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,37 @@ +# Security Policy + +## Supported Versions + +The `main` branch is the only supported development line until formal releases begin. + +## Reporting a Vulnerability + +Email support@scriptriva.com with: + +- A concise description of the issue. +- Steps to reproduce. +- Affected version or commit. +- Impact and likely abuse path. +- Any proof-of-concept files or screenshots that are safe to share. + +Please do not disclose vulnerabilities publicly until maintainers have had time to investigate and coordinate a fix. + +## Sensitive Data + +Do not commit: + +- `api_key.txt` +- Hugging Face or OpenAI-compatible API tokens +- custom voice samples +- cached `.safetensors` voice states +- generated executable artifacts +- local config files + +## Security-Relevant Areas + +- Screen/window capture. +- OCR text handling. +- Local LLM endpoint configuration. +- Custom voice file handling. +- TTS server startup and subprocess execution. +- Packaged binary contents. diff --git a/SUPPORT.md b/SUPPORT.md new file mode 100644 index 0000000..65163f8 --- /dev/null +++ b/SUPPORT.md @@ -0,0 +1,22 @@ +# Support + +For support inquiries email: support@scriptriva.com + +## Before Asking + +- Check the README setup steps. +- Confirm Tesseract OCR is available or bundled. +- Confirm `uvx` can start Pocket TTS. +- Try the latest build from `main`. +- Include the exact error text shown in the GUI. + +## Useful Details + +When reporting a problem, include: + +- Windows version. +- App version or commit. +- Whether you use monitor or window capture. +- Whether you use default voice or custom voice. 
+- Whether Local LLM is enabled. +- Steps to reproduce. diff --git a/THIRD_PARTY_NOTICES.md b/THIRD_PARTY_NOTICES.md new file mode 100644 index 0000000..982df17 --- /dev/null +++ b/THIRD_PARTY_NOTICES.md @@ -0,0 +1,45 @@ +# Third-Party Notices + +Seshat TTS uses and/or interfaces with third-party software. These projects retain their own licenses. + +## Kyutai Pocket TTS + +Repository: https://github.com/kyutai-labs/pocket-tts + +License: MIT + +License file: https://github.com/kyutai-labs/pocket-tts/blob/main/LICENSE + +Pocket TTS is used for local text-to-speech generation and voice cloning through the `uvx-server` backend. The Pocket TTS software license is MIT. Model weights, hosted files, or voice assets may have separate terms; review the upstream model and hosting pages before redistributing those assets. + +## Tesseract OCR + +Project site: https://tesseractocr.org/ + +License: Apache License 2.0 + +Tesseract OCR is used to extract text from selected screen regions. + +## pytesseract + +Repository: https://github.com/madmaze/pytesseract + +License: Apache License 2.0 + +pytesseract is used as the Python wrapper around the Tesseract executable. + +## PyInstaller + +Project site: https://pyinstaller.org/ + +License: GPLv2-or-later with bootloader exception + +PyInstaller is used to build the Windows executable. + +## OpenAI Python SDK + +Repository: https://github.com/openai/openai-python + +License: Apache License 2.0 + +The OpenAI Python SDK is used to call optional OpenAI-compatible local LLM endpoints. diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md new file mode 100644 index 0000000..cae1e3c --- /dev/null +++ b/docs/ARCHITECTURE.md @@ -0,0 +1,33 @@ +# Architecture + +Seshat TTS is a Windows desktop utility that converts selected on-screen text into streamed speech. + +## Runtime Flow + +1. User selects a monitor or window. +2. User selects a screen region. +3. Hotkey triggers capture. +4. 
Tesseract OCR extracts text from the selected region, unless LLM image extraction is enabled. +5. Optional local OpenAI-compatible LLM cleans the OCR text, or a vision-capable local LLM extracts text directly from the captured region image. +6. Pocket TTS streams speech. +7. New hotkey captures stop any active stream and start a fresh one. +8. Stop hotkey cancels active playback without starting another capture. + +## Key Modules + +- `src/seshat_tts/app.py`: Tk GUI, settings flow, hotkey orchestration. +- `src/seshat_tts/capture.py`: monitor/window capture. +- `src/seshat_tts/region_picker.py`: snipping-tool-style region selection. +- `src/seshat_tts/ocr.py`: image preprocessing and Tesseract OCR. +- `src/seshat_tts/tts.py`: Pocket TTS API/server playback. +- `src/seshat_tts/llm.py`: OpenAI-compatible local LLM cleanup and vision-based image text extraction. +- `src/seshat_tts/config.py`: persisted config loading and migration. +- `src/seshat_tts/voices.py`: named custom voice profiles. + +## Packaging + +`scripts/build_exe.ps1` builds a portable PyInstaller executable using `seshat-tts-portable.spec`. + +The packaged EXE includes first-party resources, project license, third-party notices, bundled OCR files when Tesseract is installed on the build machine, and `uvx.exe` when found. + +Pocket TTS runs through `uvx-server` in bundled builds to avoid freezing Torch and its native dependencies into the app. diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..8d7b8ef --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,57 @@ +[project] +name = "seshat-tts" +version = "0.1.0" +description = "Hotkey-triggered screen OCR to Pocket TTS." 
+requires-python = ">=3.10,<3.15" +readme = "README.md" +license = { file = "LICENSE" } +authors = [ + { name = "Scriptriva Inc.", email = "support@scriptriva.com" }, +] +keywords = ["ocr", "tts", "accessibility", "screen-capture", "pocket-tts"] +classifiers = [ + "Development Status :: 3 - Alpha", + "Environment :: Win32 (MS Windows)", + "Intended Audience :: End Users/Desktop", + "Operating System :: Microsoft :: Windows", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.13", + "Topic :: Multimedia :: Sound/Audio :: Speech", +] +dependencies = [ + "keyboard>=0.13.5", + "mss>=9.0.1", + "pillow>=10.0.0", + "pytesseract>=0.3.13", + "openai>=1.0.0", + "imageio-ffmpeg>=0.5.1", + "requests>=2.20.0", + "sounddevice>=0.4.6", + "pocket-tts>=2.1.0", + "pywin32>=306; platform_system == 'Windows'", +] + +[project.optional-dependencies] +test = [ + "pytest>=8.0.0", +] +build = [ + "pyinstaller>=6.0.0", +] + +[project.scripts] +seshat-tts = "seshat_tts.app:main" + +[project.urls] +Homepage = "https://github.com/Scriptriva/seshat-tts" +Repository = "https://github.com/Scriptriva/seshat-tts" +Issues = "https://github.com/Scriptriva/seshat-tts/issues" +Support = "mailto:support@scriptriva.com" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src/seshat_tts"] diff --git a/resources/anime_meme.gif b/resources/anime_meme.gif new file mode 100644 index 0000000..f38ecb9 Binary files /dev/null and b/resources/anime_meme.gif differ diff --git a/resources/anime_yapping.gif b/resources/anime_yapping.gif new file mode 100644 index 0000000..c21f014 Binary files /dev/null and b/resources/anime_yapping.gif differ diff --git a/resources/banner.jpg b/resources/banner.jpg new file mode 100644 index 0000000..b48e081 Binary files /dev/null and b/resources/banner.jpg differ diff --git a/resources/character.png b/resources/character.png new file 
mode 100644 index 0000000..97ff2e5 Binary files /dev/null and b/resources/character.png differ diff --git a/resources/logo.png b/resources/logo.png new file mode 100644 index 0000000..34339be Binary files /dev/null and b/resources/logo.png differ diff --git a/resources/seshat-tts.ico b/resources/seshat-tts.ico new file mode 100644 index 0000000..b9eaeee Binary files /dev/null and b/resources/seshat-tts.ico differ diff --git a/resources/seshat.png b/resources/seshat.png new file mode 100644 index 0000000..b480e20 Binary files /dev/null and b/resources/seshat.png differ diff --git a/scripts/build_exe.ps1 b/scripts/build_exe.ps1 new file mode 100644 index 0000000..a8acfca --- /dev/null +++ b/scripts/build_exe.ps1 @@ -0,0 +1,55 @@ +param( + [switch]$SkipInstall, + [switch]$OneDir +) + +$ErrorActionPreference = "Stop" +$Root = Split-Path -Parent $PSScriptRoot +Set-Location $Root +$DistApp = Join-Path $Root "dist\seshat-tts" +$DistExe = Join-Path $Root "dist\seshat-tts.exe" + +$RunningApps = @(Get-Process -Name "seshat-tts" -ErrorAction SilentlyContinue) +foreach ($Process in $RunningApps) { + try { + Stop-Process -Id $Process.Id -Force -ErrorAction Stop + } catch { + & taskkill.exe /PID $Process.Id /T /F | Out-Null + } +} +Start-Sleep -Milliseconds 500 +$StillRunning = @(Get-Process -Name "seshat-tts" -ErrorAction SilentlyContinue) +if ($StillRunning.Count -gt 0) { + $Ids = ($StillRunning | ForEach-Object { $_.Id }) -join ", " + throw "Close Seshat TTS before building. 
Could not stop running process id(s): $Ids" +} +foreach ($Target in @($DistApp, $DistExe)) { + if (Test-Path $Target) { + for ($Attempt = 1; $Attempt -le 5; $Attempt++) { + try { + Remove-Item -LiteralPath $Target -Recurse -Force + break + } catch { + if ($Attempt -eq 5) { + throw + } + Start-Sleep -Seconds 2 + } + } + } +} + +if (-not $SkipInstall) { + python -m pip install --upgrade pip + python -m pip install -e ".[build,test]" +} + +if ($OneDir) { + python -m PyInstaller --clean --noconfirm seshat-tts.spec + Write-Host "Built: $Root\dist\seshat-tts\seshat-tts.exe" +} else { + python -m PyInstaller --clean --noconfirm seshat-tts-portable.spec + Write-Host "Built portable EXE: $Root\dist\seshat-tts.exe" +} + +Write-Host "The portable EXE bundles the GUI runtime, bundled OCR files when Tesseract is installed on this build machine, and uvx.exe when found." diff --git a/scripts/build_launcher_exe.ps1 b/scripts/build_launcher_exe.ps1 new file mode 100644 index 0000000..99ef0b4 --- /dev/null +++ b/scripts/build_launcher_exe.ps1 @@ -0,0 +1,111 @@ +$ErrorActionPreference = "Stop" +$Root = Split-Path -Parent $PSScriptRoot +$OutDir = Join-Path $Root "dist\launcher" +$OutExe = Join-Path $OutDir "seshat-tts.exe" +$BuildDir = Join-Path $Root "build\launcher" +$Source = Join-Path $BuildDir "SeshatTtsLauncher.cs" +$Project = Join-Path $BuildDir "SeshatTtsLauncher.csproj" +$Icon = Join-Path $Root "resources\seshat-tts.ico" +$BuildIcon = Join-Path $BuildDir "seshat-tts.ico" + +New-Item -ItemType Directory -Force -Path $OutDir | Out-Null +New-Item -ItemType Directory -Force -Path $BuildDir | Out-Null +Get-ChildItem -Path (Join-Path $BuildDir "*") -File -Include "*.cs", "*.csproj" | Remove-Item -Force +foreach ($GeneratedDir in @("bin", "obj")) { + $Path = Join-Path $BuildDir $GeneratedDir + if (Test-Path $Path) { + Remove-Item -Path $Path -Recurse -Force + } +} +if (Test-Path $Icon) { + Copy-Item -Path $Icon -Destination $BuildIcon -Force +} + +@' +using System; +using 
System.Diagnostics; +using System.IO; + +public static class SeshatTtsLauncher +{ + public static int Main(string[] args) + { + string exeDir = AppDomain.CurrentDomain.BaseDirectory; + string root = Path.GetFullPath(Path.Combine(exeDir, "..", "..")); + string venvPythonw = Path.Combine(root, ".venv", "Scripts", "pythonw.exe"); + string venvPython = Path.Combine(root, ".venv", "Scripts", "python.exe"); + + string python = File.Exists(venvPythonw) ? venvPythonw : + File.Exists(venvPython) ? venvPython : + "pythonw.exe"; + + string arguments = "-m seshat_tts"; + if (args.Length > 0) + { + arguments += " " + string.Join(" ", Array.ConvertAll(args, Quote)); + } + + ProcessStartInfo start = new ProcessStartInfo(); + start.FileName = python; + start.Arguments = arguments; + start.WorkingDirectory = root; + start.UseShellExecute = false; + start.CreateNoWindow = true; + start.EnvironmentVariables["PYTHONPATH"] = Path.Combine(root, "src"); + + try + { + Process process = Process.Start(start); + return process == null ? 1 : 0; + } + catch + { + start.FileName = "python.exe"; + start.CreateNoWindow = false; + Process process = Process.Start(start); + return process == null ? 1 : 0; + } + } + + private static string Quote(string value) + { + if (value.IndexOfAny(new char[] {' ', '\t', '"'}) < 0) + { + return value; + } + return "\"" + value.Replace("\"", "\\\"") + "\""; + } +} +'@ | Set-Content -Path $Source -Encoding UTF8 + +$compiler = Get-Command csc.exe -ErrorAction SilentlyContinue +if ($compiler) { + $iconArg = if (Test-Path $Icon) { "/win32icon:$Icon" } else { $null } + & $compiler.Source /nologo /target:winexe /out:$OutExe $iconArg $Source +} else { + $dotnet = Get-Command dotnet.exe -ErrorAction SilentlyContinue + if (-not $dotnet) { + throw "No C# compiler found. Install the .NET SDK or add csc.exe to PATH." 
+ } + +@' + + + WinExe + net10.0-windows + disable + disable + seshat-tts + seshat-tts.ico + + + + + +'@ | Set-Content -Path $Project -Encoding UTF8 + + dotnet publish $Project -c Release -o $OutDir --nologo +} + +Write-Host "Built fast launcher: $OutExe" +Write-Host "This launcher expects dependencies installed in .venv or the active Python environment." diff --git a/scripts/pyinstaller_entry.py b/scripts/pyinstaller_entry.py new file mode 100644 index 0000000..d6d7d99 --- /dev/null +++ b/scripts/pyinstaller_entry.py @@ -0,0 +1,7 @@ +from __future__ import annotations + +from seshat_tts.app import main + + +if __name__ == "__main__": + main() diff --git a/seshat-tts-portable.spec b/seshat-tts-portable.spec new file mode 100644 index 0000000..50ee0d7 --- /dev/null +++ b/seshat-tts-portable.spec @@ -0,0 +1,94 @@ +# -*- mode: python ; coding: utf-8 -*- + +import os +import shutil +from pathlib import Path + +ROOT = Path.cwd() + +datas = [] +binaries = [] + +resources = ROOT / "resources" +if resources.exists(): + datas.append((str(resources), "resources")) + +license_file = ROOT / "LICENSE" +if license_file.exists(): + datas.append((str(license_file), ".")) + +third_party_notices = ROOT / "THIRD_PARTY_NOTICES.md" +if third_party_notices.exists(): + datas.append((str(third_party_notices), ".")) + +tesseract_dir = Path(os.environ.get("SESHAT_TESSERACT_DIR", r"C:\Program Files\Tesseract-OCR")) +if tesseract_dir.exists(): + datas.append((str(tesseract_dir), "tesseract")) + +for tool_name in ("uvx.exe", "uv.exe"): + tool = shutil.which(tool_name) + if tool: + binaries.append((tool, "tools")) + +a = Analysis( + ["scripts/pyinstaller_entry.py"], + pathex=[str(ROOT / "src")], + binaries=binaries, + datas=datas, + hiddenimports=[ + "pytesseract", + "mss", + "keyboard", + "sounddevice", + "imageio_ffmpeg", + "openai", + "win32gui", + "win32con", + "win32ui", + ], + hookspath=[], + hooksconfig={}, + runtime_hooks=[], + excludes=[ + "pocket_tts", + "torch", + "torchaudio", + 
"torchvision", + "torchao", + "xformers", + "triton", + "bitsandbytes", + "pandas", + "scipy", + "matplotlib", + "pyarrow", + "numba", + "llvmlite", + "pytest", + "IPython", + "jupyter", + ], + noarchive=False, + optimize=0, +) +pyz = PYZ(a.pure) + +exe = EXE( + pyz, + a.scripts, + a.binaries, + a.datas, + [], + name="seshat-tts", + debug=False, + bootloader_ignore_signals=False, + strip=False, + upx=True, + console=False, + disable_windowed_traceback=False, + argv_emulation=False, + target_arch=None, + codesign_identity=None, + entitlements_file=None, + icon=str(ROOT / "resources" / "seshat-tts.ico"), +) diff --git a/seshat-tts.spec b/seshat-tts.spec new file mode 100644 index 0000000..1d9718e --- /dev/null +++ b/seshat-tts.spec @@ -0,0 +1,80 @@ +# -*- mode: python ; coding: utf-8 -*- + +from pathlib import Path + +ROOT = Path.cwd() + +a = Analysis( + ["scripts/pyinstaller_entry.py"], + pathex=[str(ROOT / "src")], + binaries=[], + datas=( + ([(str(ROOT / "resources"), "resources")] if (ROOT / "resources").exists() else []) + + ([(str(ROOT / "LICENSE"), ".")] if (ROOT / "LICENSE").exists() else []) + + ([(str(ROOT / "THIRD_PARTY_NOTICES.md"), ".")] if (ROOT / "THIRD_PARTY_NOTICES.md").exists() else []) + ), + hiddenimports=[ + "pytesseract", + "mss", + "keyboard", + "sounddevice", + "imageio_ffmpeg", + "openai", + "win32gui", + "win32con", + "win32ui", + ], + hookspath=[], + hooksconfig={}, + runtime_hooks=[], + excludes=[ + "pocket_tts", + "torch", + "torchaudio", + "torchvision", + "torchao", + "xformers", + "triton", + "bitsandbytes", + "pandas", + "scipy", + "matplotlib", + "pyarrow", + "numba", + "llvmlite", + "pytest", + "IPython", + "jupyter", + ], + noarchive=False, + optimize=0, +) +pyz = PYZ(a.pure) + +exe = EXE( + pyz, + a.scripts, + [], + exclude_binaries=True, + name="seshat-tts", + debug=False, + bootloader_ignore_signals=False, + strip=False, + upx=True, + console=False, + disable_windowed_traceback=False, + argv_emulation=False, + target_arch=None, 
+ codesign_identity=None, + entitlements_file=None, + icon=str(ROOT / "resources" / "seshat-tts.ico"), +) +coll = COLLECT( + exe, + a.binaries, + a.datas, + strip=False, + upx=True, + upx_exclude=[], + name="seshat-tts", +) diff --git a/src/seshat_tts/__init__.py b/src/seshat_tts/__init__.py new file mode 100644 index 0000000..5bd4f3c --- /dev/null +++ b/src/seshat_tts/__init__.py @@ -0,0 +1,5 @@ +"""Hotkey OCR capture to Pocket TTS.""" + +__all__ = ["__version__"] + +__version__ = "0.1.0" diff --git a/src/seshat_tts/__main__.py b/src/seshat_tts/__main__.py new file mode 100644 index 0000000..4668130 --- /dev/null +++ b/src/seshat_tts/__main__.py @@ -0,0 +1,6 @@ +from .app import main + + +if __name__ == "__main__": + main() + diff --git a/src/seshat_tts/app.py b/src/seshat_tts/app.py new file mode 100644 index 0000000..5360ab1 --- /dev/null +++ b/src/seshat_tts/app.py @@ -0,0 +1,1228 @@ +from __future__ import annotations + +import ctypes +import os +import sys +import threading +import tkinter as tk +import webbrowser +from pathlib import Path +from tkinter import filedialog, messagebox, simpledialog, ttk + +from PIL import Image, ImageTk + +from .capture import capture_monitor_region, capture_window_region, list_monitors +from .config import AppConfig, Rect, load_config, save_config +from .hotkeys import HotkeyManager, listen_for_hotkey +from .llm import load_api_key_file, process_image_with_llm, process_text_with_llm +from .ocr import extract_ocr_text +from .region_picker import RegionPicker +from .resources import resource_path +from .tesseract import find_tesseract +from .tts import PocketTTSStreamer, UvxPocketTTSServer +from .voices import VoiceProfile, load_voice_profiles, save_voice_profiles, upsert_voice_profile, voice_profile_by_name +from .windows import WindowInfo, find_window_by_title, list_visible_windows + +DEFAULT_VOICES = [ + "alba", + "marius", + "javert", + "jean", + "anna", + "vera", + "fantine", + "charles", + "paul", + "george", + "mary", + 
"jane", + "michael", + "eve", + "giovanni", + "lola", + "juergen", + "rafael", + "estelle", +] + + +class SeshatTtsApp(tk.Tk): + def __init__(self) -> None: + _set_windows_app_user_model_id() + super().__init__() + self.title("Seshat TTS") + self.geometry("1060x920") + self.minsize(900, 720) + self.resizable(True, True) + self.configure(bg="#07090d") + self._set_window_icon() + + self.config_model = load_config() + self.voice_profiles = load_voice_profiles() + self.hotkeys = HotkeyManager() + self.tts: PocketTTSStreamer | UvxPocketTTSServer | None = None + self._recording_hotkey = False + self._capture_lock = threading.Lock() + self._monitor_values: dict[str, int] = {} + self._window_values: dict[str, WindowInfo] = {} + self._responsive_labels: list[tk.Widget] = [] + + self._configure_theme() + self._build_ui() + self._load_values() + self._refresh_targets() + self._register_hotkey() + self.after(250, self._poll_tts_status) + self.protocol("WM_DELETE_WINDOW", self._on_close) + + def _set_window_icon(self) -> None: + icon_path = resource_path("resources/seshat-tts.ico") + png_path = resource_path("resources/character.png") + try: + if icon_path.exists(): + self.iconbitmap(str(icon_path)) + self.iconbitmap(default=str(icon_path)) + self.wm_iconbitmap(str(icon_path)) + if png_path.exists(): + image = Image.open(png_path).convert("RGBA").resize((256, 256), Image.Resampling.LANCZOS) + self._icon_photo = ImageTk.PhotoImage(image) + self.iconphoto(True, self._icon_photo) + except (tk.TclError, OSError): + pass + + def _build_ui(self) -> None: + shell = ttk.Frame(self) + shell.pack(fill=tk.BOTH, expand=True) + self._scroll_canvas = tk.Canvas(shell, bg="#07090d", highlightthickness=0) + scrollbar = ttk.Scrollbar(shell, orient=tk.VERTICAL, command=self._scroll_canvas.yview) + self._scroll_canvas.configure(yscrollcommand=scrollbar.set) + scrollbar.pack(side=tk.RIGHT, fill=tk.Y) + self._scroll_canvas.pack(side=tk.LEFT, fill=tk.BOTH, expand=True) + root = 
ttk.Frame(self._scroll_canvas, padding=22) + self._scroll_window = self._scroll_canvas.create_window((0, 0), window=root, anchor=tk.NW) + root.bind("<Configure>", self._on_scroll_content_configure) + self._scroll_canvas.bind("<Configure>", self._on_scroll_canvas_configure) + self._scroll_canvas.bind("<Enter>", lambda _event: self._bind_canvas_scroll()) + self._scroll_canvas.bind("<Leave>", lambda _event: self._unbind_canvas_scroll()) + root.bind("<Enter>", lambda _event: self._bind_canvas_scroll()) + + header = ttk.Frame(root, style="Hero.TFrame", padding=18) + header.pack(fill=tk.X, pady=(0, 18)) + header_icon = self._load_header_icon() + if header_icon is not None: + ttk.Label(header, image=header_icon, style="HeroIcon.TLabel").pack(side=tk.LEFT, padx=(0, 16)) + header_text = ttk.Frame(header, style="Hero.TFrame") + header_text.pack(side=tk.LEFT, fill=tk.X, expand=True) + ttk.Label(header_text, text="Seshat TTS", style="Title.TLabel").pack(anchor=tk.W) + self._wrap_label( + header_text, + text="One-hotkey OCR capture with Pocket TTS playback", + style="Subtitle.TLabel", + ).pack(anchor=tk.W, fill=tk.X, pady=(2, 0)) + self._wrap_label( + header_text, + text="Developed by Scriptriva Inc.
| support@scriptriva.com", + style="Meta.TLabel", + ).pack(anchor=tk.W, fill=tk.X, pady=(6, 0)) + ttk.Button(header, text="i", width=3, command=self.show_about, style="IconButton.TButton").pack( + side=tk.RIGHT, + padx=(12, 0), + ) + + capture_frame = self._section(root, "Capture", "Choose the active window or monitor and bind hotkeys.") + capture_frame.columnconfigure(1, weight=1) + + ttk.Label(capture_frame, text="Mode", style="Field.TLabel").grid(row=0, column=0, sticky=tk.W, padx=8, pady=7) + self.capture_mode_var = tk.StringVar() + mode_box = ttk.Combobox( + capture_frame, + textvariable=self.capture_mode_var, + state="readonly", + values=["monitor", "window"], + ) + mode_box.grid(row=0, column=1, sticky=tk.EW, padx=8, pady=7) + + ttk.Label(capture_frame, text="Monitor", style="Field.TLabel").grid(row=1, column=0, sticky=tk.W, padx=8, pady=7) + self.monitor_var = tk.StringVar() + self.monitor_box = ttk.Combobox(capture_frame, textvariable=self.monitor_var, state="readonly") + self.monitor_box.grid(row=1, column=1, sticky=tk.EW, padx=8, pady=7) + ttk.Button(capture_frame, text="Refresh", command=self._refresh_targets).grid(row=1, column=2, padx=8, pady=7) + + ttk.Label(capture_frame, text="Window", style="Field.TLabel").grid(row=2, column=0, sticky=tk.W, padx=8, pady=7) + self.window_var = tk.StringVar() + self.window_box = ttk.Combobox(capture_frame, textvariable=self.window_var, state="readonly") + self.window_box.grid(row=2, column=1, sticky=tk.EW, padx=8, pady=7) + + ttk.Label(capture_frame, text="Read Hotkey", style="Field.TLabel").grid(row=3, column=0, sticky=tk.W, padx=8, pady=7) + self.hotkey_var = tk.StringVar() + self.hotkey_entry = self._hotkey_entry(capture_frame, self.hotkey_var, "read") + self.hotkey_entry.grid(row=3, column=1, sticky=tk.EW, padx=8, pady=7) + + ttk.Label(capture_frame, text="Region Hotkey", style="Field.TLabel").grid(row=4, column=0, sticky=tk.W, padx=8, pady=7) + self.capture_region_hotkey_var = tk.StringVar() + 
self.capture_region_hotkey_entry = self._hotkey_entry( + capture_frame, self.capture_region_hotkey_var, "capture-region" + ) + self.capture_region_hotkey_entry.grid(row=4, column=1, sticky=tk.EW, padx=8, pady=7) + + ttk.Label(capture_frame, text="Stop Hotkey", style="Field.TLabel").grid(row=5, column=0, sticky=tk.W, padx=8, pady=7) + self.stop_hotkey_var = tk.StringVar() + self.stop_hotkey_entry = self._hotkey_entry(capture_frame, self.stop_hotkey_var, "stop-playback") + self.stop_hotkey_entry.grid(row=5, column=1, sticky=tk.EW, padx=8, pady=7) + + ttk.Button(capture_frame, text="Apply", command=self._save_and_register, style="Accent.TButton").grid( + row=5, column=2, padx=8, pady=7 + ) + + rect_frame = self._section(root, "Capture Region", "Drag a precise reading region over the text to parse.") + for col in range(8): + rect_frame.columnconfigure(col, weight=1) + + self.left_var = tk.IntVar() + self.top_var = tk.IntVar() + self.width_var = tk.IntVar() + self.height_var = tk.IntVar() + self._number_field(rect_frame, "Left", self.left_var, 0, 0) + self._number_field(rect_frame, "Top", self.top_var, 0, 2) + self._number_field(rect_frame, "Width", self.width_var, 0, 4) + self._number_field(rect_frame, "Height", self.height_var, 0, 6) + ttk.Button(rect_frame, text="Select Region", command=self.select_region, style="Accent.TButton").grid( + row=1, column=0, columnspan=2, sticky=tk.W, padx=8, pady=6 + ) + + tts_frame = self._section(root, "OCR and Voice", "Select OCR, voice, playback, and backend settings.") + tts_frame.columnconfigure(1, weight=1) + + ttk.Label(tts_frame, text="Tesseract", style="Field.TLabel").grid(row=0, column=0, sticky=tk.W, padx=8, pady=6) + self.tesseract_var = tk.StringVar() + ttk.Entry(tts_frame, textvariable=self.tesseract_var).grid(row=0, column=1, sticky=tk.EW, padx=8, pady=6) + tesseract_buttons = ttk.Frame(tts_frame) + tesseract_buttons.grid(row=0, column=2, sticky=tk.E, padx=8, pady=6) + ttk.Button(tesseract_buttons, text="Detect", 
command=self._detect_tesseract).pack(side=tk.LEFT) + ttk.Button(tesseract_buttons, text="Browse", command=self._browse_tesseract).pack(side=tk.LEFT, padx=(6, 0)) + + ttk.Label(tts_frame, text="Voice Source", style="Field.TLabel").grid(row=1, column=0, sticky=tk.W, padx=8, pady=6) + self.voice_source_var = tk.StringVar() + ttk.Combobox( + tts_frame, + textvariable=self.voice_source_var, + state="readonly", + values=["default", "custom-wav"], + ).grid(row=1, column=1, sticky=tk.EW, padx=8, pady=6) + + ttk.Label(tts_frame, text="Default Voice", style="Field.TLabel").grid(row=2, column=0, sticky=tk.W, padx=8, pady=6) + self.default_voice_var = tk.StringVar() + ttk.Combobox( + tts_frame, + textvariable=self.default_voice_var, + state="readonly", + values=DEFAULT_VOICES, + ).grid(row=2, column=1, sticky=tk.EW, padx=8, pady=6) + + ttk.Label(tts_frame, text="Custom Voice", style="Field.TLabel").grid(row=3, column=0, sticky=tk.W, padx=8, pady=6) + self.custom_voice_name_var = tk.StringVar() + self.custom_voice_box = ttk.Combobox( + tts_frame, + textvariable=self.custom_voice_name_var, + state="readonly", + ) + self.custom_voice_box.grid(row=3, column=1, sticky=tk.EW, padx=8, pady=6) + self.custom_voice_box.bind("<<ComboboxSelected>>", lambda _event: self._apply_selected_voice_profile()) + voice_buttons = ttk.Frame(tts_frame) + voice_buttons.grid(row=3, column=2, sticky=tk.E, padx=8, pady=6) + ttk.Button(voice_buttons, text="Manage", command=self.open_voice_manager).pack(side=tk.LEFT) + + ttk.Label(tts_frame, text="Voice File", style="Field.TLabel").grid(row=4, column=0, sticky=tk.W, padx=8, pady=6) + self.voice_var = tk.StringVar() + ttk.Entry(tts_frame, textvariable=self.voice_var).grid(row=4, column=1, sticky=tk.EW, padx=8, pady=6) + ttk.Button(tts_frame, text="Browse", command=self._browse_voice).grid(row=4, column=2, padx=8, pady=6) + + ttk.Label(tts_frame, text="Language", style="Field.TLabel").grid(row=5, column=0, sticky=tk.W, padx=8, pady=6) + self.language_var = tk.StringVar() +
ttk.Entry(tts_frame, textvariable=self.language_var, state="readonly").grid( + row=5, + column=1, + sticky=tk.EW, + padx=8, + pady=6, + ) + self.quantize_var = tk.BooleanVar() + ttk.Checkbutton(tts_frame, text="Quantize", variable=self.quantize_var).grid( + row=5, column=2, sticky=tk.W, padx=8, pady=6 + ) + + ttk.Label(tts_frame, text="Volume", style="Field.TLabel").grid(row=6, column=0, sticky=tk.W, padx=8, pady=6) + self.volume_var = tk.DoubleVar(value=100.0) + volume_frame = ttk.Frame(tts_frame) + volume_frame.grid(row=6, column=1, sticky=tk.EW, padx=8, pady=6) + volume_frame.columnconfigure(0, weight=1) + ttk.Scale( + volume_frame, + variable=self.volume_var, + from_=0, + to=300, + command=lambda _value: self._update_volume_label(), + ).grid(row=0, column=0, sticky=tk.EW) + self.volume_label_var = tk.StringVar(value="100%") + ttk.Label(volume_frame, textvariable=self.volume_label_var, width=6).grid(row=0, column=1, padx=(8, 0)) + + ttk.Label(tts_frame, text="Backend", style="Field.TLabel").grid(row=7, column=0, sticky=tk.W, padx=8, pady=6) + self.tts_backend_var = tk.StringVar() + ttk.Combobox( + tts_frame, + textvariable=self.tts_backend_var, + state="readonly", + values=_tts_backend_options(), + ).grid(row=7, column=1, sticky=tk.EW, padx=8, pady=6) + + server_frame = ttk.Frame(tts_frame) + server_frame.grid(row=7, column=2, sticky=tk.E, padx=8, pady=6) + self.tts_port_var = tk.IntVar() + ttk.Label(server_frame, text="Port", style="Field.TLabel").pack(side=tk.LEFT) + ttk.Entry(server_frame, textvariable=self.tts_port_var, width=6).pack(side=tk.LEFT, padx=(6, 0)) + + llm_frame = self._section( + root, + "Local LLM", + "Optional OpenAI-compatible text cleanup or image-based text extraction before speech.", + expanded=False, + ) + llm_frame.columnconfigure(1, weight=1) + + self.llm_enabled_var = tk.BooleanVar() + self._wrap_checkbutton(llm_frame, text="Route OCR through local OpenAI-compatible LLM", variable=self.llm_enabled_var).grid( + row=0, column=0, 
columnspan=3, sticky=tk.W, padx=8, pady=6 + ) + + ttk.Label(llm_frame, text="Base URL", style="Field.TLabel").grid(row=1, column=0, sticky=tk.W, padx=8, pady=6) + self.llm_base_url_var = tk.StringVar() + ttk.Entry(llm_frame, textvariable=self.llm_base_url_var).grid(row=1, column=1, sticky=tk.EW, padx=8, pady=6) + + ttk.Label(llm_frame, text="API Key", style="Field.TLabel").grid(row=2, column=0, sticky=tk.W, padx=8, pady=6) + self.llm_api_key_var = tk.StringVar() + ttk.Entry(llm_frame, textvariable=self.llm_api_key_var, show="*").grid( + row=2, column=1, sticky=tk.EW, padx=8, pady=6 + ) + ttk.Button(llm_frame, text="Load api_key.txt", command=self._load_llm_api_key_file).grid( + row=2, column=2, padx=8, pady=6 + ) + + ttk.Label(llm_frame, text="Model", style="Field.TLabel").grid(row=3, column=0, sticky=tk.W, padx=8, pady=6) + self.llm_model_var = tk.StringVar() + ttk.Entry(llm_frame, textvariable=self.llm_model_var).grid(row=3, column=1, sticky=tk.EW, padx=8, pady=6) + llm_limits = ttk.Frame(llm_frame) + llm_limits.grid(row=3, column=2, sticky=tk.E, padx=8, pady=6) + self.llm_timeout_var = tk.DoubleVar() + self.llm_max_tokens_var = tk.IntVar() + self.llm_disable_thinking_var = tk.BooleanVar() + ttk.Label(llm_limits, text="Timeout", style="Field.TLabel").pack(side=tk.LEFT) + ttk.Entry(llm_limits, textvariable=self.llm_timeout_var, width=5).pack(side=tk.LEFT, padx=(6, 10)) + ttk.Label(llm_limits, text="Tokens", style="Field.TLabel").pack(side=tk.LEFT) + ttk.Entry(llm_limits, textvariable=self.llm_max_tokens_var, width=5).pack(side=tk.LEFT, padx=(6, 0)) + ttk.Checkbutton(llm_frame, text="Disable thinking", variable=self.llm_disable_thinking_var).grid( + row=4, column=0, columnspan=3, sticky=tk.W, padx=8, pady=6 + ) + self.llm_image_extraction_var = tk.BooleanVar() + self._wrap_checkbutton( + llm_frame, + text="Use local LLM vision instead of Tesseract OCR", + variable=self.llm_image_extraction_var, + ).grid(row=5, column=0, columnspan=3, sticky=tk.W, padx=8, pady=6) + 
+ ttk.Label(llm_frame, text="Prompt", style="Field.TLabel").grid(row=6, column=0, sticky=tk.NW, padx=8, pady=6) + self.llm_prompt_box = tk.Text( + llm_frame, + height=3, + wrap=tk.WORD, + bg="#0b0f14", + fg="#f4f7fb", + insertbackground="#f4f7fb", + relief=tk.FLAT, + highlightthickness=1, + highlightbackground="#263241", + highlightcolor="#4ea1ff", + padx=8, + pady=6, + font=("Segoe UI", 10), + ) + self.llm_prompt_box.grid(row=6, column=1, columnspan=2, sticky=tk.EW, padx=8, pady=6) + + actions = ttk.Frame(root, style="ActionBar.TFrame", padding=(12, 12)) + actions.pack(fill=tk.X, pady=(0, 12)) + ttk.Button(actions, text="Save Settings", command=self._save_and_register).pack(side=tk.LEFT) + ttk.Button(actions, text="Capture Now", command=self.capture_now, style="Accent.TButton").pack( + side=tk.LEFT, padx=8 + ) + ttk.Button(actions, text="Preload TTS", command=self.preload_tts).pack(side=tk.LEFT) + ttk.Button(actions, text="Test TTS", command=self.test_tts).pack(side=tk.LEFT, padx=8) + + self.status_var = tk.StringVar(value="Ready.") + ttk.Label(root, textvariable=self.status_var, style="Status.TLabel").pack(fill=tk.X, pady=(0, 12)) + + output_frame = self._section(root, "Last Extracted Text", "Parsed text and long status messages.", expanded=True) + output_frame.rowconfigure(0, weight=1) + output_frame.columnconfigure(0, weight=1) + self.text_box = tk.Text( + output_frame, + height=12, + wrap=tk.WORD, + bg="#0b0f14", + fg="#f4f7fb", + insertbackground="#f4f7fb", + relief=tk.FLAT, + highlightthickness=1, + highlightbackground="#263241", + highlightcolor="#4ea1ff", + padx=12, + pady=10, + font=("Segoe UI", 10), + ) + self.text_box.grid(row=0, column=0, sticky=tk.NSEW) + + def _on_scroll_content_configure(self, _event: tk.Event) -> None: + self._scroll_canvas.configure(scrollregion=self._scroll_canvas.bbox("all")) + + def _on_scroll_canvas_configure(self, event: tk.Event) -> None: + self._scroll_canvas.itemconfigure(self._scroll_window, width=event.width) + 
def _on_mousewheel(self, event: tk.Event) -> None:
    """Scroll the main canvas in response to an application-wide wheel event.

    Events that originate over a list box or combobox are ignored so those
    widgets keep their own wheel behaviour.

    Args:
        event: The mouse-wheel event; ``event.delta`` gives direction and
            magnitude, ``event.widget`` the widget under the pointer.
    """
    widget = event.widget
    # tkinter hands over the raw Tk path name (a ``str``) when the event comes
    # from a widget it did not create itself, such as the drop-down list of a
    # ttk combobox. Such a value has no ``winfo_class``; treat it like the
    # list/combobox case and leave the canvas alone.
    if isinstance(widget, str):
        return
    if widget.winfo_class() in {"Listbox", "TCombobox"}:
        return

    delta = event.delta
    if not delta:
        return
    # One wheel notch is assumed to report a delta of 120 (the Windows
    # convention); a positive delta scrolls the view up.
    units = int(-delta / 120)
    if units == 0:
        # High-resolution wheels and touchpads send deltas below one notch.
        # Truncation would discard them, so move a single unit instead.
        units = -1 if delta > 0 else 1
    self._scroll_canvas.yview_scroll(units, "units")
def _bind_canvas_scroll(self) -> None:
    """Route every mouse-wheel event in the application to ``_on_mousewheel``.

    The binding must name ``<MouseWheel>``: the handler reads ``event.delta``,
    and Tk refuses a binding whose event sequence is empty.
    """
    self._scroll_canvas.bind_all("<MouseWheel>", self._on_mousewheel)


def _unbind_canvas_scroll(self) -> None:
    """Remove the application-wide mouse-wheel binding again."""
    self._scroll_canvas.unbind_all("<MouseWheel>")
background="#111a26", foreground=text, font=("Segoe UI Semibold", 12)) + style.configure("CardSubtitle.TLabel", background="#111a26", foreground=muted, font=("Segoe UI", 9)) + style.configure("Field.TLabel", background=panel, foreground="#dce7f4", font=("Segoe UI Semibold", 9)) + style.configure("AboutPanel.TFrame", background=bg) + style.configure("AboutFooter.TFrame", background=bg) + style.configure("AboutTitle.TLabel", background=bg, foreground=text, font=("Segoe UI Semibold", 20)) + style.configure("AboutSubtitle.TLabel", background=bg, foreground=muted, font=("Segoe UI", 10)) + style.configure("AboutSection.TLabel", background=bg, foreground=text, font=("Segoe UI Semibold", 10)) + style.configure("AboutBody.TLabel", background=bg, foreground=text, font=("Segoe UI", 10)) + style.configure("AboutLink.TLabel", background=bg, foreground="#74b8ff", font=("Segoe UI", 10)) + + style.configure("TLabelframe", background=panel, foreground=text, bordercolor=border, relief=tk.SOLID) + style.configure("TLabelframe.Label", background=bg, foreground=text, font=("Segoe UI Semibold", 10)) + style.configure( + "TEntry", + fieldbackground="#0b0f14", + foreground=text, + bordercolor=border, + insertcolor=text, + padding=(8, 6), + ) + style.map("TEntry", fieldbackground=[("readonly", "#0b0f14")], foreground=[("readonly", text)]) + style.configure( + "TCombobox", + fieldbackground="#0b0f14", + background=panel_alt, + foreground=text, + bordercolor=border, + arrowcolor=accent, + padding=(8, 6), + ) + style.map( + "TCombobox", + fieldbackground=[("readonly", "#0b0f14")], + foreground=[("readonly", text)], + selectbackground=[("readonly", "#0b0f14")], + selectforeground=[("readonly", text)], + ) + style.configure("TCheckbutton", background=panel, foreground=text, focuscolor=panel, padding=(2, 4)) + style.map("TCheckbutton", background=[("active", panel)], foreground=[("active", text)]) + style.configure( + "TButton", + background="#192231", + foreground=text, + bordercolor=border, + 
focusthickness=0, + padding=(12, 7), + ) + style.map("TButton", background=[("active", "#243248"), ("pressed", "#0f1722")]) + style.configure( + "Secondary.TButton", + background="#121b28", + foreground="#c9d5e3", + bordercolor=border, + focusthickness=0, + padding=(10, 6), + ) + style.map("Secondary.TButton", background=[("active", "#1d2a3d"), ("pressed", "#0d131d")]) + style.configure( + "Accent.TButton", + background=accent, + foreground="#06101a", + bordercolor=accent, + padding=(12, 7), + ) + style.map("Accent.TButton", background=[("active", accent_active), ("pressed", "#2e83d1")]) + style.configure( + "IconButton.TButton", + background="#192231", + foreground=text, + bordercolor=border, + focusthickness=0, + padding=(8, 5), + font=("Segoe UI Semibold", 10), + ) + style.map("IconButton.TButton", background=[("active", "#243248"), ("pressed", "#0f1722")]) + + def show_about(self) -> None: + about = tk.Toplevel(self) + about.title("About Seshat TTS") + about.transient(self) + about.configure(bg="#07090d") + about.geometry("640x520") + about.minsize(640, 520) + about.maxsize(640, 520) + about.resizable(False, False) + try: + about.iconbitmap(default=str(resource_path("resources/seshat-tts.ico"))) + except tk.TclError: + pass + + frame = ttk.Frame(about, padding=22) + frame.pack(fill=tk.BOTH, expand=True) + frame.rowconfigure(4, weight=1) + frame.columnconfigure(0, weight=1) + + ttk.Label(frame, text="Seshat TTS", style="AboutTitle.TLabel").grid(row=0, column=0, sticky=tk.W) + ttk.Label(frame, text="Maintainer: Scriptriva Inc.", style="AboutSubtitle.TLabel").grid( + row=1, + column=0, + sticky=tk.W, + pady=(8, 0), + ) + ttk.Label( + frame, + text="For support inquiries email: support@scriptriva.com", + style="AboutSubtitle.TLabel", + wraplength=580, + ).grid( + row=2, + column=0, + sticky=tk.W, + pady=(3, 18), + ) + ttk.Label( + frame, + text="Project license: Scriptriva Public Source License 1.0", + style="AboutSection.TLabel", + wraplength=580, + ).grid(row=3, 
def _about_link(self, parent: ttk.Frame, text: str, target: str | Path) -> None:
    """Add a link-styled label to *parent* that opens *target* when clicked.

    Args:
        parent: Frame the label is packed into.
        text: Caption shown to the user.
        target: A local file (``Path``) or a URL (``str``) to open.
    """
    label = ttk.Label(parent, text=text, foreground="#74b8ff", cursor="hand2", style="AboutLink.TLabel")
    label.pack(anchor=tk.W, pady=(6, 0))
    # A left click is what activates the link; an empty event sequence is
    # rejected by Tk and would make the About window fail to open.
    label.bind("<Button-1>", lambda _event: self._open_about_target(target))


def _open_about_target(self, target: str | Path) -> None:
    """Open a link target from the About window.

    Local files are handed to the operating system's default handler; any
    other target is treated as a URL and opened in the web browser.

    Args:
        target: ``Path`` of a bundled document, or a URL string.
    """
    if isinstance(target, Path):
        # A bundled document that is missing is skipped. It is never passed
        # on to ``webbrowser.open``, which expects a URL string.
        if target.exists():
            os.startfile(target)
        return
    webbrowser.open(target)
def _load_header_icon(self) -> ImageTk.PhotoImage | None:
    """Load the 56x56 header logo, or return ``None`` when it is unavailable.

    The ``PhotoImage`` is also stored on ``self._header_icon`` so a reference
    to it stays alive for as long as the application object does.

    Returns:
        The Tk-compatible image, or ``None`` if the file is missing or cannot
        be decoded.
    """
    path = resource_path("resources/logo.png")
    if not path.exists():
        return None
    try:
        # Close the file as soon as the pixels are converted instead of
        # leaving the open handle to the garbage collector.
        with Image.open(path) as source:
            image = source.convert("RGBA").resize((56, 56), Image.Resampling.LANCZOS)
        self._header_icon = ImageTk.PhotoImage(image)
        return self._header_icon
    except Exception:
        # The logo is decorative: any decoding or Tk failure means "no icon".
        return None


def _hotkey_entry(self, parent: ttk.Frame, variable: tk.StringVar, name: str) -> ttk.Entry:
    """Create a read-only entry that starts hotkey recording when activated.

    Args:
        parent: Frame that owns the entry.
        variable: String variable displaying the current hotkey.
        name: Label for the hotkey, passed on to ``_start_hotkey_recording``.

    Returns:
        The entry widget; the caller is responsible for placing it.
    """
    entry = ttk.Entry(parent, textvariable=variable, state="readonly")

    def begin_recording(_event: tk.Event) -> None:
        self._start_hotkey_recording(name, variable)

    # NOTE(review): both bindings had an empty event sequence, which Tk
    # rejects. Click and focus-gain are assumed to be the intended triggers
    # (``_start_hotkey_recording`` ignores a second call while one recording
    # is in progress) -- confirm against the upstream source.
    entry.bind("<Button-1>", begin_recording)
    entry.bind("<FocusIn>", begin_recording)
    return entry
self.quantize_var.set(cfg.quantize_tts) + self.volume_var.set(cfg.volume_gain * 100) + self._update_volume_label() + backend = cfg.tts_backend if cfg.tts_backend in _tts_backend_options() else "uvx-server" + self.tts_backend_var.set(backend) + self.tts_port_var.set(cfg.tts_port) + self.llm_enabled_var.set(cfg.llm_enabled) + self.llm_base_url_var.set(cfg.llm_base_url) + self.llm_api_key_var.set(cfg.llm_api_key) + self.llm_model_var.set(cfg.llm_model) + self.llm_timeout_var.set(cfg.llm_timeout) + self.llm_max_tokens_var.set(cfg.llm_max_tokens) + self.llm_disable_thinking_var.set(cfg.llm_disable_thinking) + self.llm_image_extraction_var.set(cfg.llm_image_extraction) + self.llm_prompt_box.delete("1.0", tk.END) + self.llm_prompt_box.insert("1.0", cfg.llm_system_prompt) + self._set_text(cfg.last_text) + + def _refresh_targets(self) -> None: + self._refresh_monitors() + self._refresh_windows() + + def _refresh_monitors(self) -> None: + monitors = list_monitors() + self._monitor_values = {monitor.label: monitor.index for monitor in monitors} + values = list(self._monitor_values) + self.monitor_box["values"] = values + selected = next((label for label, index in self._monitor_values.items() if index == self.config_model.monitor_index), "") + if not selected and values: + selected = values[0] + self.monitor_var.set(selected) + + def _refresh_windows(self) -> None: + windows = list_visible_windows() + self._window_values = {window.label: window for window in windows} + values = list(self._window_values) + self.window_box["values"] = values + selected = next( + (label for label, window in self._window_values.items() if window.title == self.config_model.window_title), + "", + ) + if not selected: + selected = next( + (label for label, window in self._window_values.items() if "neverwinter nights" in window.title.casefold()), + "", + ) + if not selected and values: + selected = values[0] + self.window_var.set(selected) + + def _browse_tesseract(self) -> None: + path = 
def open_voice_manager(self) -> None:
    """Open the "Custom Voices" window for saving, applying and deleting voice profiles.

    The window shows the stored profiles in a list next to name and path
    fields. "Save" stores the fields as a profile, "Use Selected" stores them
    and makes that voice the active custom voice, "Delete" removes the profile
    highlighted in the list.
    """
    manager = tk.Toplevel(self)
    manager.title("Custom Voices")
    manager.transient(self)
    manager.configure(bg="#07090d")
    manager.geometry("700x420")
    manager.minsize(620, 360)
    try:
        manager.iconbitmap(default=str(resource_path("resources/seshat-tts.ico")))
    except tk.TclError:
        # The window icon is cosmetic; continue without it.
        pass

    frame = ttk.Frame(manager, padding=14)
    frame.pack(fill=tk.BOTH, expand=True)
    frame.columnconfigure(1, weight=1)
    frame.rowconfigure(0, weight=1)

    voice_list = tk.Listbox(
        frame,
        bg="#0b0f14",
        fg="#f4f7fb",
        selectbackground="#255f99",
        selectforeground="#ffffff",
        highlightthickness=1,
        highlightbackground="#263241",
        relief=tk.FLAT,
        exportselection=False,
    )
    voice_list.grid(row=0, column=0, rowspan=5, sticky=tk.NSEW, padx=(0, 12))
    frame.columnconfigure(0, minsize=200)

    name_var = tk.StringVar()
    path_var = tk.StringVar()
    ttk.Label(frame, text="Name").grid(row=0, column=1, sticky=tk.SW, pady=(0, 4))
    ttk.Entry(frame, textvariable=name_var).grid(row=1, column=1, sticky=tk.EW, pady=(0, 10))
    ttk.Label(frame, text="WAV, MP3, or cached safetensors").grid(row=2, column=1, sticky=tk.SW, pady=(0, 4))
    ttk.Entry(frame, textvariable=path_var).grid(row=3, column=1, sticky=tk.EW)
    ttk.Button(
        frame,
        text="Browse",
        command=lambda: self._browse_voice_for_manager(path_var, name_var),
    ).grid(row=3, column=2, padx=(8, 0))

    def refresh_list() -> None:
        """Reload the stored profiles and mirror their names into the list box."""
        self._refresh_voice_profiles()
        voice_list.delete(0, tk.END)
        for profile in self.voice_profiles:
            voice_list.insert(tk.END, profile.name)

    def on_select(_event: tk.Event | None = None) -> None:
        """Copy the highlighted profile into the name and path fields."""
        selection = voice_list.curselection()
        if not selection:
            return
        profile = self.voice_profiles[selection[0]]
        name_var.set(profile.name)
        path_var.set(profile.path)

    def save_current() -> bool:
        """Store the fields as a profile; return ``False`` if they are incomplete."""
        name = name_var.get().strip()
        path = path_var.get().strip()
        if not name or not path:
            messagebox.showerror("Custom voice", "Name and path are required.", parent=manager)
            return False
        self._save_voice_profile(name, path)
        refresh_list()
        return True

    def use_current() -> None:
        """Save the fields, activate that voice and close the window."""
        # Stop here when validation failed. Otherwise an empty name and path
        # would be applied as the active custom voice and the window closed
        # right after the error message.
        if not save_current():
            return
        self.custom_voice_name_var.set(name_var.get().strip())
        self.voice_var.set(path_var.get().strip())
        self.voice_source_var.set("custom-wav")
        manager.destroy()

    def delete_current() -> None:
        """Remove the highlighted profile and clear the fields."""
        selection = voice_list.curselection()
        if not selection:
            return
        selected_name = self.voice_profiles[selection[0]].name
        self.voice_profiles = [profile for profile in self.voice_profiles if profile.name != selected_name]
        save_voice_profiles(self.voice_profiles)
        refresh_list()
        name_var.set("")
        path_var.set("")

    buttons = ttk.Frame(frame)
    buttons.grid(row=4, column=1, columnspan=2, sticky=tk.EW, pady=(16, 0))
    ttk.Button(buttons, text="Save", command=save_current).pack(side=tk.LEFT)
    ttk.Button(buttons, text="Use Selected", command=use_current, style="Accent.TButton").pack(side=tk.LEFT, padx=8)
    ttk.Button(buttons, text="Delete", command=delete_current).pack(side=tk.LEFT)
    ttk.Button(buttons, text="Close", command=manager.destroy).pack(side=tk.RIGHT)

    # The list box announces selection changes through this virtual event.
    voice_list.bind("<<ListboxSelect>>", on_select)
    refresh_list()
def _save_and_register(self) -> None:
    """Persist the settings shown in the UI and re-register the global hotkeys.

    The current TTS backend is shut down and discarded so that the next
    ``_ensure_tts`` call builds a fresh one from the saved configuration.
    Any failure is reported in a "Settings error" dialog.
    """
    try:
        self.config_model = self._read_config_from_ui()
        save_config(self.config_model)
        self._register_hotkey()
        # _register_hotkey reports failures only through the status line, so
        # read it back before it is overwritten below.
        hotkey_status = self.status_var.get()

        # Release the old backend explicitly, using the same ``close()`` hook
        # the window-close handler calls, instead of just dropping the
        # reference to it.
        previous_tts, self.tts = self.tts, None
        if previous_tts is not None:
            previous_tts.close()

        if hotkey_status.startswith("Hotkey error:"):
            # Keep the registration problem visible instead of hiding it
            # behind a plain success message.
            self.status_var.set(f"Settings saved. {hotkey_status}")
        else:
            self.status_var.set("Settings saved.")
    except Exception as exc:
        messagebox.showerror("Settings error", str(exc))
def _record_hotkey_worker(self, variable: tk.StringVar) -> None:
    """Wait for the next hotkey and hand the outcome back via ``self.after``.

    Started on a background thread by ``_start_hotkey_recording``. It calls
    ``listen_for_hotkey`` and schedules either ``_finish_hotkey_recording``
    or ``_cancel_hotkey_recording`` with ``self.after(0, ...)``.

    Args:
        variable: The string variable that should receive the recorded hotkey.
    """
    try:
        hotkey = listen_for_hotkey()
    except Exception as exc:
        # Python unbinds ``exc`` when the except clause ends, which is before
        # Tk runs the scheduled callback. Capture the message now so the
        # callback does not fail with a NameError.
        reason = str(exc)
        self.after(0, lambda: self._cancel_hotkey_recording(reason))
    else:
        self.after(0, lambda: self._finish_hotkey_recording(variable, hotkey))
Press Escape to cancel.") + RegionPicker(self, bounds, self._apply_selected_region) + except Exception as exc: + messagebox.showerror("Region selection error", str(exc)) + + def _apply_selected_region(self, rect: Rect) -> None: + relative = self._relative_to_target(rect, self._read_config_from_ui()) + self.left_var.set(relative.left) + self.top_var.set(relative.top) + self.width_var.set(relative.width) + self.height_var.set(relative.height) + self._save_and_register() + self.status_var.set(f"Selected region {relative.width}x{relative.height} at {relative.left},{relative.top}.") + + def preload_tts(self) -> None: + try: + cfg = self._read_config_from_ui() + self.config_model = cfg + save_config(cfg) + self._ensure_tts(cfg).preload_async() + self.status_var.set("TTS preload started.") + except Exception as exc: + messagebox.showerror("TTS preload error", str(exc)) + + def test_tts(self) -> None: + try: + cfg = self._read_config_from_ui() + self.config_model = cfg + save_config(cfg) + self._ensure_tts(cfg).test_async() + self.status_var.set("TTS test started.") + except Exception as exc: + messagebox.showerror("TTS test error", str(exc)) + + def _capture_worker(self) -> None: + with self._capture_lock: + try: + cfg = self._read_config_from_ui() + if self.tts is not None: + self.tts.stop() + self._set_status("Capturing selected region...") + image = self._capture_image(cfg) + if cfg.llm_image_extraction: + self._set_status("Extracting text from image with local LLM...") + text = process_image_with_llm( + image, + base_url=cfg.llm_base_url, + api_key=cfg.llm_api_key, + model=cfg.llm_model, + timeout=cfg.llm_timeout, + max_tokens=cfg.llm_max_tokens, + disable_thinking=cfg.llm_disable_thinking, + ) + else: + self._set_status("Running OCR...") + text = extract_ocr_text(image, cfg.tesseract_cmd) + if cfg.llm_enabled: + self._set_status("Routing OCR text through local LLM...") + text = process_text_with_llm( + text, + enabled=True, + base_url=cfg.llm_base_url, + 
api_key=cfg.llm_api_key, + model=cfg.llm_model, + system_prompt=cfg.llm_system_prompt, + timeout=cfg.llm_timeout, + max_tokens=cfg.llm_max_tokens, + disable_thinking=cfg.llm_disable_thinking, + ) + if not text: + self._set_status("No dialogue text found.") + return + cfg.last_text = text + self.config_model = cfg + save_config(cfg) + self.after(0, lambda: self._set_text(text)) + self._ensure_tts(cfg).speak_async(text) + except Exception as exc: + self._set_status(f"Capture error: {exc}") + + def _capture_image(self, cfg: AppConfig): + if cfg.capture_mode == "window": + selected = self._window_values.get(self.window_var.get()) + window = selected or find_window_by_title(cfg.window_title) + if window is None: + raise ValueError("Selected window is not available. Refresh windows and select it again.") + return capture_window_region(window.hwnd, cfg.dialogue_rect) + return capture_monitor_region(cfg.monitor_index, cfg.dialogue_rect) + + def _target_bounds(self, cfg: AppConfig) -> Rect: + if cfg.capture_mode == "window": + selected = self._window_values.get(self.window_var.get()) + window = selected or find_window_by_title(cfg.window_title) + if window is None: + raise ValueError("Selected window is not available. 
def _poll_tts_status(self) -> None:
    """Drain pending TTS status messages into the UI and re-arm the poll timer.

    The first line of each message goes to the status bar. Messages that span
    several lines or exceed 180 characters are also shown in full in the
    output text box. The method reschedules itself every 250 ms.
    """
    try:
        tts = self.tts
        if tts is not None:
            while not tts.status_queue.empty():
                status = tts.status_queue.get_nowait()
                lines = status.splitlines()
                if not lines:
                    # An empty message has no first line to display; indexing
                    # it would raise and end the polling loop.
                    continue
                self.status_var.set(lines[0])
                if "\n" in status or len(status) > 180:
                    self._set_text(status)
    finally:
        # Re-arm even when a message could not be displayed; otherwise one bad
        # status would stop status reporting until the application restarts.
        self.after(250, self._poll_tts_status)
def capture_window(hwnd: int) -> Image.Image:
    """Render the window identified by ``hwnd`` into an RGB image.

    The window rectangle from ``GetWindowRect`` sizes an off-screen bitmap,
    which is filled through ``PrintWindow`` (flag ``2`` first, then flag ``0``
    as a fallback) and converted from BGRX bytes.

    Raises:
        ValueError: the window rectangle has a non-positive width or height.
        RuntimeError: both ``PrintWindow`` attempts returned something other
            than ``1``.
    """
    from contextlib import ExitStack

    left, top, right, bottom = win32gui.GetWindowRect(hwnd)
    width = right - left
    height = bottom - top
    if width <= 0 or height <= 0:
        raise ValueError("Selected window has no capturable size.")

    # Each handle is registered for release right after it is acquired, so a
    # failure part-way through setup cannot leak the handles obtained before
    # it. ExitStack unwinds in reverse order: bitmap, memory DC, source DC,
    # then the window DC.
    with ExitStack() as cleanup:
        hwnd_dc = win32gui.GetWindowDC(hwnd)
        cleanup.callback(win32gui.ReleaseDC, hwnd, hwnd_dc)
        source_dc = win32ui.CreateDCFromHandle(hwnd_dc)
        cleanup.callback(source_dc.DeleteDC)
        memory_dc = source_dc.CreateCompatibleDC()
        cleanup.callback(memory_dc.DeleteDC)
        bitmap = win32ui.CreateBitmap()
        bitmap.CreateCompatibleBitmap(source_dc, width, height)
        cleanup.callback(lambda: win32gui.DeleteObject(bitmap.GetHandle()))
        memory_dc.SelectObject(bitmap)

        result = _print_window(hwnd, memory_dc.GetSafeHdc(), 2)
        if result != 1:
            result = _print_window(hwnd, memory_dc.GetSafeHdc(), 0)
        if result != 1:
            raise RuntimeError("PrintWindow failed for the selected window.")

        info = bitmap.GetInfo()
        bits = bitmap.GetBitmapBits(True)
        # .copy() detaches the image from ``bits`` before the GDI objects are
        # released on exit from the ``with`` block.
        return Image.frombuffer(
            "RGB",
            (info["bmWidth"], info["bmHeight"]),
            bits,
            "raw",
            "BGRX",
            0,
            1,
        ).copy()
def _print_window(hwnd: int, hdc: int, flags: int) -> int:
    """Call ``user32.PrintWindow`` through ctypes and return its result as ``int``."""
    print_window = ctypes.windll.user32.PrintWindow
    outcome = print_window(hwnd, hdc, flags)
    return int(outcome)
def load_config(path: Path = CONFIG_PATH) -> AppConfig:
    """Load the application configuration from a JSON file.

    Args:
        path: location of the JSON config file.

    Returns:
        An ``AppConfig`` built from the stored values, with each missing key
        replaced by its default. A missing file, or a file whose top-level
        JSON value is not an object, yields ``AppConfig()``.

    Raises:
        json.JSONDecodeError: the file exists but is not valid JSON.
    """
    if not path.exists():
        return AppConfig()
    data = json.loads(path.read_text(encoding="utf-8"))
    if not isinstance(data, dict):
        # Every lookup below relies on ``dict.get``; any other JSON shape is
        # treated as "no stored settings".
        return AppConfig()
    # Read defaults from the dataclass so the fallbacks cannot drift from the
    # values declared on AppConfig.
    defaults = AppConfig.__dataclass_fields__
    return AppConfig(
        capture_mode=str(data.get("capture_mode", "monitor")),
        monitor_index=int(data.get("monitor_index", 1)),
        window_title=str(data.get("window_title", "")),
        hotkey=str(data.get("hotkey", "ctrl+alt+n")),
        capture_region_hotkey=str(data.get("capture_region_hotkey", "ctrl+alt+r")),
        stop_hotkey=str(data.get("stop_hotkey", "ctrl+alt+s")),
        dialogue_rect=_rect_from_dict(data.get("dialogue_rect")),
        tesseract_cmd=_tesseract_from_config(data.get("tesseract_cmd")),
        voice_source=str(data.get("voice_source", "default")),
        default_voice=str(data.get("default_voice", "alba")),
        custom_voice_name=str(data.get("custom_voice_name", "")),
        voice_path=str(data.get("voice_path", "")),
        # The stored language is ignored; the value is always "english".
        language="english",
        quantize_tts=bool(data.get("quantize_tts", False)),
        volume_gain=float(data.get("volume_gain", 1.0)),
        tts_backend=str(data.get("tts_backend", "uvx-server")),
        tts_host=str(data.get("tts_host", "localhost")),
        tts_port=int(data.get("tts_port", 8000)),
        llm_enabled=bool(data.get("llm_enabled", False)),
        llm_base_url=str(data.get("llm_base_url", "http://127.0.0.1:8000/v1")),
        llm_api_key=str(data.get("llm_api_key", "")),
        llm_model=str(data.get("llm_model", defaults["llm_model"].default)),
        llm_timeout=float(data.get("llm_timeout", 5.0)),
        llm_max_tokens=int(data.get("llm_max_tokens", 256)),
        llm_disable_thinking=bool(data.get("llm_disable_thinking", True)),
        llm_image_extraction=bool(data.get("llm_image_extraction", False)),
        llm_system_prompt=str(
            data.get(
                "llm_system_prompt",
                defaults["llm_system_prompt"].default,
            )
        ),
        last_text=_clean_last_text(data.get("last_text", "")),
    )
def process_text_with_llm(
    text: str,
    *,
    enabled: bool,
    base_url: str,
    api_key: str,
    model: str,
    system_prompt: str,
    timeout: float = 5.0,
    max_tokens: int = 256,
    disable_thinking: bool = True,
    client: _OpenAIClient | None = None,
) -> str:
    """Clean up ``text`` through an OpenAI-compatible chat completion.

    Args:
        text: text to clean; surrounding whitespace is stripped first.
        enabled: when false the stripped text is returned without a request.
        base_url: endpoint passed to the OpenAI client.
        api_key: API key; a blank key is replaced by ``"local"``.
        model: model name sent in the request.
        system_prompt: system message sent before the user text.
        timeout: client timeout in seconds, floored at 0.1.
        max_tokens: completion limit, floored at 1.
        disable_thinking: when true, adds ``extra_body`` switches that ask the
            server to turn reasoning off.
        client: optional pre-built client; when omitted an ``openai.OpenAI``
            client is created.

    Returns:
        The stripped completion text, or the stripped input when the feature
        is disabled, the input is empty, or the response has no usable text.
    """
    text = text.strip()
    if not enabled or not text:
        return text

    if client is None:
        # Imported lazily so the openai package is only needed when no client
        # is supplied.
        from openai import OpenAI

        client = OpenAI(
            api_key=api_key.strip() or "local",
            base_url=base_url.strip(),
            timeout=max(0.1, float(timeout)),
        )

    request: dict[str, object] = {
        "model": model.strip(),
        "messages": [
            {"role": "system", "content": system_prompt.strip()},
            {"role": "user", "content": text},
        ],
        "temperature": 0,
        "max_tokens": max(1, int(max_tokens)),
        "stream": False,
    }
    if disable_thinking:
        request["extra_body"] = {
            "chat_template_kwargs": {"enable_thinking": False},
            "enable_thinking": False,
            "reasoning_effort": "none",
        }

    response = client.chat.completions.create(**request)
    # A response without choices falls back to the input text, like an empty
    # completion does, instead of raising IndexError.
    choices = getattr(response, "choices", None) or []
    if not choices:
        return text
    message = getattr(choices[0], "message", None)
    content = getattr(message, "content", None)
    return str(content or "").strip() or text
base_url: str, + api_key: str, + model: str, + timeout: float = 5.0, + max_tokens: int = 256, + disable_thinking: bool = True, + client: _OpenAIClient | None = None, +) -> str: + if client is None: + from openai import OpenAI + + client = OpenAI( + api_key=api_key.strip() or "local", + base_url=base_url.strip(), + timeout=max(0.1, float(timeout)), + ) + + request: dict[str, object] = { + "model": model.strip(), + "messages": [ + {"role": "system", "content": IMAGE_EXTRACTION_SYSTEM_PROMPT}, + { + "role": "user", + "content": [ + {"type": "text", "text": IMAGE_EXTRACTION_USER_PROMPT}, + { + "type": "image_url", + "image_url": { + "url": f"data:image/png;base64,{_image_to_base64_png(image)}", + "detail": "high", + }, + }, + ], + }, + ], + "temperature": 0, + "max_tokens": max(1, int(max_tokens)), + "stream": False, + } + if disable_thinking: + request["extra_body"] = { + "chat_template_kwargs": {"enable_thinking": False}, + "enable_thinking": False, + "reasoning_effort": "none", + } + + response = client.chat.completions.create(**request) + content = response.choices[0].message.content + return str(content or "").strip() + + +def _image_to_base64_png(image: Image.Image) -> str: + buffer = BytesIO() + image.convert("RGB").save(buffer, format="PNG") + return base64.b64encode(buffer.getvalue()).decode("ascii") diff --git a/src/seshat_tts/ocr.py b/src/seshat_tts/ocr.py new file mode 100644 index 0000000..88a87ac --- /dev/null +++ b/src/seshat_tts/ocr.py @@ -0,0 +1,60 @@ +from __future__ import annotations + +import os +from pathlib import Path + +from PIL import Image, ImageEnhance, ImageFilter, ImageOps + +from .tesseract import tesseract_help_message + + +def preprocess_for_ocr(image: Image.Image) -> Image.Image: + image = ImageOps.expand(image, border=12, fill=(0, 0, 0)) + gray = ImageOps.grayscale(image) + enlarged = gray.resize((gray.width * 2, gray.height * 2), Image.Resampling.LANCZOS) + contrast = ImageEnhance.Contrast(enlarged).enhance(2.2) + sharpened = 
def normalize_line(line: str) -> str:
    """Collapse whitespace and replace curly quotes with ASCII quotes.

    Runs of whitespace become a single space, leading and trailing whitespace
    is removed, and the left/right double and single quotation marks are
    mapped to ``"`` and ``'``.
    """
    # Written as \u escapes so the mapping survives a source file that is
    # decoded with the wrong encoding.
    quote_map = str.maketrans(
        {
            "\u201c": '"',
            "\u201d": '"',
            "\u2018": "'",
            "\u2019": "'",
        }
    )
    # split() with no argument splits on whitespace runs and drops empty
    # edge pieces, so no regex is needed.
    collapsed = " ".join(line.split())
    return collapsed.translate(quote_map)
super().__init__(parent) + self._bounds = bounds + self._on_selected = on_selected + self._start_x = 0 + self._start_y = 0 + self._rect_id: int | None = None + self._label_id: int | None = None + + self.overrideredirect(True) + self.attributes("-topmost", True) + self.geometry(f"{bounds.width}x{bounds.height}{bounds.left:+d}{bounds.top:+d}") + self.configure(cursor="crosshair") + + screenshot = capture_absolute_region(bounds.left, bounds.top, bounds.width, bounds.height) + dimmed = ImageEnhance.Brightness(screenshot).enhance(0.55) + self._image = ImageTk.PhotoImage(dimmed) + + self.canvas = tk.Canvas(self, bg="#050608", highlightthickness=0, cursor="crosshair") + self.canvas.pack(fill=tk.BOTH, expand=True) + self.canvas.create_image(0, 0, image=self._image, anchor=tk.NW) + self.canvas.create_text( + 18, + 18, + text="Drag to select. Esc cancels.", + fill="#f4f7fb", + anchor=tk.NW, + font=("Segoe UI", 12, "bold"), + ) + self.canvas.bind("", self._on_press) + self.canvas.bind("", self._on_drag) + self.canvas.bind("", self._on_release) + self.bind("", lambda _event: self.destroy()) + + self.focus_force() + self.grab_set() + + def _on_press(self, event: tk.Event) -> None: + self._start_x = int(event.x) + self._start_y = int(event.y) + self._rect_id = self.canvas.create_rectangle( + self._start_x, + self._start_y, + self._start_x, + self._start_y, + outline="#ff365f", + width=3, + ) + + def _on_drag(self, event: tk.Event) -> None: + if self._rect_id is not None: + x1, x2 = sorted((self._start_x, int(event.x))) + y1, y2 = sorted((self._start_y, int(event.y))) + self.canvas.coords(self._rect_id, x1, y1, x2, y2) + label = f"{x2 - x1} x {y2 - y1}" + if self._label_id is None: + self._label_id = self.canvas.create_text( + x1 + 8, + max(12, y1 - 18), + text=label, + fill="#f4f7fb", + anchor=tk.W, + font=("Segoe UI", 10, "bold"), + ) + else: + self.canvas.coords(self._label_id, x1 + 8, max(12, y1 - 18)) + self.canvas.itemconfigure(self._label_id, text=label) + + def 
def resource_path(relative_path: str) -> Path:
    """Resolve ``relative_path`` against the application's resource root.

    The root is ``sys._MEIPASS`` when that attribute exists; otherwise it is
    the directory two levels above the directory containing this file.
    """
    bundle_root = getattr(sys, "_MEIPASS", None)
    if bundle_root is not None:
        return Path(bundle_root) / relative_path
    # The source-tree fallback is only computed when it is needed, so a
    # bundled run never resolves ``__file__`` or indexes its parents.
    return Path(__file__).resolve().parents[2] / relative_path
def speak_async(self, text: str) -> None:
    """Start speaking ``text`` on a daemon thread; blank text is ignored.

    A new stream is begun first, which signals any earlier stream to stop.
    """
    cleaned = text.strip()
    if cleaned:
        token = self._begin_new_stream()
        worker = threading.Thread(target=self._speak, args=(cleaned, token), daemon=True)
        worker.start()
self._cancel_event.set() + + def _begin_new_stream(self) -> threading.Event: + with self._cancel_lock: + self._cancel_event.set() + self._cancel_event = threading.Event() + return self._cancel_event + + def _preload(self) -> None: + with self._lock: + try: + self._load() + except Exception as exc: + self._status_queue.put(f"TTS preload error: {exc}") + + def _load(self) -> None: + if self._model is not None and self._voice_state is not None: + return + self._status_queue.put("Loading Pocket TTS model...") + try: + pocket_tts = importlib.import_module("pocket_tts") + tts_model = getattr(pocket_tts, "TTSModel") + except (ImportError, OSError) as exc: + raise RuntimeError( + "Pocket TTS failed to load through the in-process Python API. " + "Use the uvx-server backend, especially from the bundled EXE." + ) from exc + + try: + self._model = tts_model.load_model(language=self.language, quantize=self.quantize) + except OSError as exc: + raise RuntimeError( + "Pocket TTS/Torch DLL initialization failed in the in-process Python API. " + "Use the uvx-server backend instead." 
def _speak(self, text: str, cancel_event: threading.Event) -> None:
    """Generate audio for ``text`` and play it until done or cancelled.

    Runs on a worker thread started by ``speak_async``. Progress and errors
    are reported through ``_status_queue``; no exception leaves this method.

    Args:
        text: the text to synthesise.
        cancel_event: set by ``stop`` or by a newer stream to end playback.
    """
    # Holding _lock for the whole body means only one load or playback runs
    # at a time on this instance.
    with self._lock:
        try:
            # Returns at once when the model and voice state are already set.
            self._load()
            # Loading can be slow; skip playback if a cancel arrived meanwhile.
            if cancel_event.is_set():
                self._status_queue.put("Stopped previous TTS stream.")
                return
            assert self._model is not None
            assert self._voice_state is not None
            sample_rate = int(self._model.sample_rate)
            self._status_queue.put("Speaking OCR text...")
            with sd.OutputStream(samplerate=sample_rate, channels=1, dtype="float32") as stream:
                for chunk in self._model.generate_audio_stream(self._voice_state, text):
                    # Checked once per chunk, so a cancel takes effect at the
                    # next chunk boundary.
                    if cancel_event.is_set():
                        self._status_queue.put("Stopped previous TTS stream.")
                        return
                    # NOTE(review): detach().cpu().numpy() suggests a torch
                    # tensor - confirm against pocket_tts.
                    audio = chunk.detach().cpu().numpy()
                    # Flatten to a 1-D float32 array for the mono stream.
                    audio = np.asarray(audio, dtype=np.float32).reshape(-1)
                    if audio.size:
                        stream.write(_apply_volume_gain(audio, self.volume_gain))
            self._status_queue.put("Done.")
        except Exception as exc:
            # Report the failure on the status queue rather than letting it
            # end the worker thread unreported.
            self._status_queue.put(f"TTS error: {exc}")
def stop(self) -> None:
    """Signal the current stream to stop and close any in-flight response."""
    with self._cancel_lock:
        self._cancel_event.set()
        in_flight = self._active_response
        if in_flight is None:
            return
        in_flight.close()
def _speak(self, text: str, cancel_event: threading.Event) -> None:
    """Request audio for ``text`` from the TTS server and play it.

    Runs on a worker thread started by ``speak_async``. Progress and errors
    are reported through ``_status_queue``; no exception leaves this method.

    Args:
        text: the text to synthesise.
        cancel_event: set by ``stop`` or by a newer stream to end playback.
    """
    # _speak_lock keeps requests to the server strictly one at a time.
    with self._speak_lock:
        if cancel_event.is_set():
            self._status_queue.put("Stopped previous TTS stream.")
            return
        response: requests.Response | None = None
        try:
            self._ensure_server()
            # Starting the server can be slow; re-check before sending.
            if cancel_event.is_set():
                self._status_queue.put("Stopped previous TTS stream.")
                return
            self._status_queue.put("Requesting Pocket TTS audio...")
            if self.voice_source == "default":
                voice_url = self.default_voice
            else:
                voice_url = self._custom_voice_url()
            response = requests.post(
                urljoin(self.base_url, "tts"),
                data={"text": text, "voice_url": voice_url},
                stream=True,
                timeout=900,
            )
            # Publish the response so stop() can close it from another thread.
            with self._cancel_lock:
                self._active_response = response
            response.raise_for_status()
            self._play_streaming_wav(response, cancel_event)
            if not cancel_event.is_set():
                self._status_queue.put("Done.")
        except requests.RequestException as exc:
            # Closing the response during a cancel surfaces as a request
            # error, so it is reported as a stop instead.
            if cancel_event.is_set():
                self._status_queue.put("Stopped previous TTS stream.")
            else:
                self._status_queue.put(f"TTS error: {exc}")
        except Exception as exc:
            self._status_queue.put(f"TTS error: {exc}")
        finally:
            with self._cancel_lock:
                self._active_response = None
            # A streamed response holds its connection until the body is
            # fully read or it is closed; close it on every path.
            if response is not None:
                response.close()
class _VoiceStateServer:
    """Lazily started local HTTP server that serves ``VOICE_CACHE_DIR``.

    The server binds to ``127.0.0.1`` on a port chosen by the OS and runs on a
    daemon thread. It is created on the first ``url_for`` call and reused.
    """

    def __init__(self) -> None:
        self._lock = threading.Lock()
        self._server: http.server.ThreadingHTTPServer | None = None
        self._thread: threading.Thread | None = None

    def url_for(self, path: Path) -> str:
        """Return the local URL for the file named ``path.name``.

        Only the final path component is used in the URL.
        NOTE(review): presumably ``path`` always lives directly inside
        ``VOICE_CACHE_DIR`` - confirm against callers.
        """
        with self._lock:
            VOICE_CACHE_DIR.mkdir(parents=True, exist_ok=True)
            server = self._server
            if server is None:
                server = self._start_locked()
            port = server.server_address[1]
            return f"http://127.0.0.1:{port}/{quote(path.name)}"

    def _start_locked(self) -> http.server.ThreadingHTTPServer:
        """Create the server and its serving thread; the caller holds ``_lock``."""
        handler = functools.partial(_QuietStaticFileHandler, directory=str(VOICE_CACHE_DIR))
        self._server = http.server.ThreadingHTTPServer(("127.0.0.1", 0), handler)
        self._thread = threading.Thread(target=self._server.serve_forever, daemon=True)
        self._thread.start()
        return self._server
source.suffix.casefold() == ".safetensors": + if not target.exists(): + shutil.copy2(source, target) + status_queue.put("Using cached custom voice state.") + return target + if target.exists(): + status_queue.put("Using cached custom voice state.") + return target + + prompt_source = _prepared_audio_prompt_path(source, language, status_queue, digest) + status_queue.put("Exporting custom voice cache; first run can take a while.") + command = [ + str(_find_uvx()), + "pocket-tts", + "export-voice", + str(prompt_source), + str(target), + "--language", + language, + "--quiet", + ] + subprocess.run(command, check=True, env=_clean_subprocess_env(), creationflags=_subprocess_creationflags()) + status_queue.put("Custom voice cache ready.") + return target + + +def _prepared_audio_prompt_path( + source_path: str | Path, + language: str, + status_queue: queue.Queue[str], + digest: str | None = None, +) -> Path: + source = Path(source_path) + if source.suffix.casefold() != ".mp3": + return source + VOICE_CACHE_DIR.mkdir(parents=True, exist_ok=True) + if digest is None: + stat = source.stat() + digest = hashlib.sha256( + f"{source.resolve()}|{stat.st_mtime_ns}|{stat.st_size}|{language}".encode("utf-8") + ).hexdigest()[:24] + target = VOICE_CACHE_DIR / f"{safe_voice_slug(source.stem)}-{digest}.wav" + if target.exists(): + status_queue.put("Using cached WAV conversion for MP3 voice.") + return target + status_queue.put("Converting MP3 voice reference to WAV...") + _convert_mp3_to_wav(source, target) + status_queue.put("MP3 voice conversion ready.") + return target + + +def _convert_mp3_to_wav(source: Path, target: Path) -> None: + try: + import imageio_ffmpeg + except ImportError as exc: + raise RuntimeError("MP3 custom voices require imageio-ffmpeg. 
Reinstall Seshat TTS dependencies.") from exc + + command = [ + imageio_ffmpeg.get_ffmpeg_exe(), + "-hide_banner", + "-loglevel", + "error", + "-y", + "-i", + str(source), + "-vn", + "-ac", + "1", + "-ar", + "24000", + "-sample_fmt", + "s16", + str(target), + ] + subprocess.run(command, check=True, env=_clean_subprocess_env(), creationflags=_subprocess_creationflags()) + + +def _find_wav_data_offset(data: bytearray) -> int | None: + marker = data.find(b"data") + if marker < 0 or len(data) < marker + 8: + return None + return marker + 8 + + +def _read_wav_format(data: bytearray) -> tuple[int, int, int]: + if len(data) < 36 or data[:4] != b"RIFF" or data[8:12] != b"WAVE": + raise ValueError("Response is not a WAV stream.") + fmt = data.find(b"fmt ") + if fmt < 0 or len(data) < fmt + 24: + raise ValueError("WAV stream is missing fmt chunk.") + channels = int.from_bytes(data[fmt + 10 : fmt + 12], "little") + sample_rate = int.from_bytes(data[fmt + 12 : fmt + 16], "little") + bits_per_sample = int.from_bytes(data[fmt + 22 : fmt + 24], "little") + return channels, sample_rate, bits_per_sample // 8 + + +def _pcm_to_float32(pcm: bytes, sample_width: int, channels: int) -> np.ndarray: + if sample_width == 2: + audio = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32768.0 + elif sample_width == 4: + audio = np.frombuffer(pcm, dtype=np.int32).astype(np.float32) / 2147483648.0 + else: + raise ValueError(f"Unsupported WAV sample width: {sample_width}") + if channels > 1: + return audio.reshape(-1, channels) + return audio.reshape(-1, 1) + + +def _clamp_volume_gain(value: float) -> float: + return max(0.0, min(float(value), 3.0)) + + +def _apply_volume_gain(audio: np.ndarray, volume_gain: float) -> np.ndarray: + gain = _clamp_volume_gain(volume_gain) + if gain == 1.0: + return audio + return np.clip(audio * gain, -1.0, 1.0).astype(np.float32, copy=False) + + +def _find_uvx() -> Path: + bundled = resource_path("tools/uvx.exe") + if bundled.exists(): + return bundled + 
found = shutil.which("uvx") + if found: + return Path(found) + candidates = [ + Path.home() / ".local" / "bin" / "uvx.exe", + Path.home() / ".cargo" / "bin" / "uvx.exe", + ] + for candidate in candidates: + if candidate.exists(): + return candidate + raise FileNotFoundError("uvx.exe was not found on PATH. Install uv or add uvx.exe to PATH.") + + +def _clean_subprocess_env() -> dict[str, str]: + env = os.environ.copy() + for key in list(env): + if key.startswith("_PYI") or key.startswith("PYINSTALLER"): + env.pop(key, None) + env.pop("PYTHONHOME", None) + env.pop("PYTHONPATH", None) + + user_bin = Path.home() / ".local" / "bin" + if user_bin.exists(): + env["PATH"] = str(user_bin) + os.pathsep + env.get("PATH", "") + return env + + +def _subprocess_creationflags() -> int: + if os.name != "nt": + return 0 + return int(getattr(subprocess, "CREATE_NO_WINDOW", 0)) diff --git a/src/seshat_tts/voices.py b/src/seshat_tts/voices.py new file mode 100644 index 0000000..1f2f5b1 --- /dev/null +++ b/src/seshat_tts/voices.py @@ -0,0 +1,55 @@ +from __future__ import annotations + +import json +import re +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Any + +from .config import APP_DIR + + +VOICE_PROFILES_PATH = APP_DIR / "voice_profiles.json" + + +@dataclass(slots=True) +class VoiceProfile: + name: str + path: str + + +def safe_voice_slug(name: str) -> str: + slug = re.sub(r"[^a-zA-Z0-9._-]+", "-", name.strip()).strip("-._") + return slug or "custom-voice" + + +def load_voice_profiles(path: Path = VOICE_PROFILES_PATH) -> list[VoiceProfile]: + if not path.exists(): + return [] + data = json.loads(path.read_text(encoding="utf-8")) + profiles: list[VoiceProfile] = [] + for item in data if isinstance(data, list) else []: + if not isinstance(item, dict): + continue + name = str(item.get("name", "")).strip() + voice_path = str(item.get("path", "")).strip() + if name and voice_path: + profiles.append(VoiceProfile(name=name, path=voice_path)) + 
return profiles + + +def save_voice_profiles(profiles: list[VoiceProfile], path: Path = VOICE_PROFILES_PATH) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps([asdict(profile) for profile in profiles], indent=2), encoding="utf-8") + + +def upsert_voice_profile(profile: VoiceProfile, path: Path = VOICE_PROFILES_PATH) -> list[VoiceProfile]: + profiles = [item for item in load_voice_profiles(path) if item.name != profile.name] + profiles.append(profile) + profiles.sort(key=lambda item: item.name.casefold()) + save_voice_profiles(profiles, path) + return profiles + + +def voice_profile_by_name(name: str, profiles: list[VoiceProfile]) -> VoiceProfile | None: + return next((profile for profile in profiles if profile.name == name), None) diff --git a/src/seshat_tts/windows.py b/src/seshat_tts/windows.py new file mode 100644 index 0000000..a543b8d --- /dev/null +++ b/src/seshat_tts/windows.py @@ -0,0 +1,70 @@ +from __future__ import annotations + +from dataclasses import dataclass + +import win32gui + + +@dataclass(frozen=True, slots=True) +class WindowInfo: + hwnd: int + title: str + left: int + top: int + right: int + bottom: int + + @property + def width(self) -> int: + return self.right - self.left + + @property + def height(self) -> int: + return self.bottom - self.top + + @property + def label(self) -> str: + return f"{self.title} [{self.width}x{self.height} at {self.left},{self.top}]" + + +def _is_candidate(hwnd: int) -> bool: + if not win32gui.IsWindowVisible(hwnd): + return False + title = win32gui.GetWindowText(hwnd).strip() + if not title: + return False + left, top, right, bottom = win32gui.GetWindowRect(hwnd) + return (right - left) > 50 and (bottom - top) > 50 + + +def list_visible_windows() -> list[WindowInfo]: + windows: list[WindowInfo] = [] + + def callback(hwnd: int, _extra: object) -> None: + if _is_candidate(hwnd): + left, top, right, bottom = win32gui.GetWindowRect(hwnd) + windows.append( + WindowInfo( + hwnd=hwnd, 
+ title=win32gui.GetWindowText(hwnd).strip(), + left=left, + top=top, + right=right, + bottom=bottom, + ) + ) + + win32gui.EnumWindows(callback, None) + windows.sort(key=lambda item: item.title.casefold()) + return windows + + +def find_window_by_title(title: str) -> WindowInfo | None: + title = title.strip() + if not title: + return None + for window in list_visible_windows(): + if window.title == title: + return window + needle = title.casefold() + return next((window for window in list_visible_windows() if needle in window.title.casefold()), None) diff --git a/tests/test_config.py b/tests/test_config.py new file mode 100644 index 0000000..7ff3721 --- /dev/null +++ b/tests/test_config.py @@ -0,0 +1,80 @@ +from pathlib import Path +import json + +from seshat_tts.config import AppConfig, Rect, load_config, save_config + + +def test_config_roundtrip(tmp_path: Path) -> None: + path = tmp_path / "config.json" + config = AppConfig( + monitor_index=2, + hotkey="ctrl+shift+d", + capture_region_hotkey="ctrl+shift+r", + stop_hotkey="ctrl+shift+s", + dialogue_rect=Rect(left=1, top=2, width=3, height=4), + tesseract_cmd="C:/Tesseract/tesseract.exe", + voice_source="custom-wav", + default_voice="alba", + voice_path="voice.mp3", + language="english", + quantize_tts=True, + volume_gain=1.75, + last_text="hello", + ) + + save_config(config, path) + + assert load_config(path) == config + + +def test_load_config_removes_old_region_metadata_from_last_text(tmp_path: Path) -> None: + path = tmp_path / "config.json" + path.write_text( + json.dumps( + { + "last_text": "Capture region: 85,51 628x84\nText region: 85,44 633x77\n\nA line to read." + } + ), + encoding="utf-8", + ) + + assert load_config(path).last_text == "A line to read." 
+ + +def test_load_config_reads_llm_settings(tmp_path: Path) -> None: + path = tmp_path / "config.json" + path.write_text( + json.dumps( + { + "llm_enabled": True, + "llm_base_url": "http://127.0.0.1:11434/v1", + "llm_api_key": "local", + "llm_model": "unsloth-local", + "llm_timeout": 1.5, + "llm_max_tokens": 64, + "llm_disable_thinking": False, + "llm_image_extraction": True, + "llm_system_prompt": "clean this", + } + ), + encoding="utf-8", + ) + + config = load_config(path) + + assert config.llm_enabled is True + assert config.llm_base_url == "http://127.0.0.1:11434/v1" + assert config.llm_api_key == "local" + assert config.llm_model == "unsloth-local" + assert config.llm_timeout == 1.5 + assert config.llm_max_tokens == 64 + assert config.llm_disable_thinking is False + assert config.llm_image_extraction is True + assert config.llm_system_prompt == "clean this" + + +def test_load_config_forces_english_language(tmp_path: Path) -> None: + path = tmp_path / "config.json" + path.write_text(json.dumps({"language": "french"}), encoding="utf-8") + + assert load_config(path).language == "english" diff --git a/tests/test_llm.py b/tests/test_llm.py new file mode 100644 index 0000000..c0ef3b4 --- /dev/null +++ b/tests/test_llm.py @@ -0,0 +1,118 @@ +from __future__ import annotations + +from dataclasses import dataclass + +from PIL import Image + +from seshat_tts.llm import process_image_with_llm, process_text_with_llm + + +@dataclass +class _Message: + content: str + + +@dataclass +class _Choice: + message: _Message + + +@dataclass +class _Response: + choices: list[_Choice] + + +class _Completions: + def __init__(self) -> None: + self.kwargs: dict[str, object] | None = None + + def create(self, **kwargs: object) -> _Response: + self.kwargs = kwargs + return _Response([_Choice(_Message("Cleaned text."))]) + + +class _Client: + def __init__(self) -> None: + self.chat = type("Chat", (), {"completions": _Completions()})() + + +def test_llm_disabled_returns_original_text() -> 
None: + assert ( + process_text_with_llm( + " OCR text ", + enabled=False, + base_url="http://127.0.0.1:8000/v1", + api_key="local", + model="unsloth", + system_prompt="clean", + ) + == "OCR text" + ) + + +def test_llm_enabled_uses_openai_compatible_chat_client() -> None: + client = _Client() + + result = process_text_with_llm( + "OCR text", + enabled=True, + base_url="http://127.0.0.1:8000/v1", + api_key="local", + model="unsloth-model", + system_prompt="clean", + timeout=1, + max_tokens=32, + client=client, + ) + + assert result == "Cleaned text." + assert client.chat.completions.kwargs is not None + assert client.chat.completions.kwargs["model"] == "unsloth-model" + assert client.chat.completions.kwargs["temperature"] == 0 + assert client.chat.completions.kwargs["extra_body"] == { + "chat_template_kwargs": {"enable_thinking": False}, + "enable_thinking": False, + "reasoning_effort": "none", + } + + +def test_llm_can_send_without_disable_thinking_metadata() -> None: + client = _Client() + + process_text_with_llm( + "OCR text", + enabled=True, + base_url="http://127.0.0.1:8000/v1", + api_key="local", + model="unsloth-model", + system_prompt="clean", + disable_thinking=False, + client=client, + ) + + assert client.chat.completions.kwargs is not None + assert "extra_body" not in client.chat.completions.kwargs + + +def test_llm_can_extract_text_from_image_region() -> None: + client = _Client() + image = Image.new("RGB", (16, 8), "black") + + result = process_image_with_llm( + image, + base_url="http://127.0.0.1:8000/v1", + api_key="local", + model="vision-model", + timeout=1, + max_tokens=64, + client=client, + ) + + assert result == "Cleaned text." 
+ assert client.chat.completions.kwargs is not None + assert client.chat.completions.kwargs["model"] == "vision-model" + messages = client.chat.completions.kwargs["messages"] + user_content = messages[1]["content"] + assert user_content[0]["type"] == "text" + assert user_content[1]["type"] == "image_url" + assert user_content[1]["image_url"]["url"].startswith("data:image/png;base64,") diff --git a/tests/test_ocr.py b/tests/test_ocr.py new file mode 100644 index 0000000..61451e0 --- /dev/null +++ b/tests/test_ocr.py @@ -0,0 +1,22 @@ +from seshat_tts.ocr import extract_text_from_lines + + +def test_selected_text_does_not_skip_first_line() -> None: + lines = ["A large group of humanoids came from the foothills", "and headed north not long ago."] + + assert ( + extract_text_from_lines(lines) + == "A large group of humanoids came from the foothills and headed north not long ago." + ) + + +def test_selected_text_includes_choice_marker_text_when_inside_region() -> None: + lines = ["Line to read.", "|. Continue"] + + assert extract_text_from_lines(lines) == "Line to read. |. Continue" + + +def test_selected_text_includes_pipe_marker_without_dot_when_inside_region() -> None: + lines = ["Line to read.", "| Continue"] + + assert extract_text_from_lines(lines) == "Line to read. 
| Continue" diff --git a/tests/test_tts.py b/tests/test_tts.py new file mode 100644 index 0000000..6498229 --- /dev/null +++ b/tests/test_tts.py @@ -0,0 +1,34 @@ +from pathlib import Path +import queue + +from seshat_tts import tts + + +def test_prepared_audio_prompt_leaves_wav_unchanged(tmp_path: Path) -> None: + source = tmp_path / "voice.wav" + source.write_bytes(b"wav") + + assert tts._prepared_audio_prompt_path(source, "english", queue.Queue()) == source + + +def test_prepared_audio_prompt_converts_mp3_once(tmp_path: Path, monkeypatch) -> None: + source = tmp_path / "voice.mp3" + source.write_bytes(b"mp3") + cache = tmp_path / "cache" + calls: list[tuple[Path, Path]] = [] + + monkeypatch.setattr(tts, "VOICE_CACHE_DIR", cache) + + def fake_convert(input_path: Path, output_path: Path) -> None: + calls.append((input_path, output_path)) + output_path.write_bytes(b"wav") + + monkeypatch.setattr(tts, "_convert_mp3_to_wav", fake_convert) + + first = tts._prepared_audio_prompt_path(source, "english", queue.Queue()) + second = tts._prepared_audio_prompt_path(source, "english", queue.Queue()) + + assert first == second + assert first.suffix == ".wav" + assert first.exists() + assert calls == [(source, first)]