seshat-tts

This commit is contained in:
2026-05-22 05:54:01 -04:00
commit cdb4563633
48 changed files with 4192 additions and 0 deletions
+13
View File
@@ -0,0 +1,13 @@
root = true
[*]
charset = utf-8
end_of_line = crlf
insert_final_newline = true
trim_trailing_whitespace = true
[*.{md,yml,yaml,toml,py}]
end_of_line = lf
[*.ps1]
end_of_line = crlf
+64
View File
@@ -0,0 +1,64 @@
name: Bug report
description: Report a reproducible Seshat TTS problem
title: "[Bug]: "
labels: ["bug", "needs-triage"]
body:
- type: textarea
id: summary
attributes:
label: Summary
description: What happened?
validations:
required: true
- type: textarea
id: steps
attributes:
label: Steps to reproduce
description: List the exact steps.
placeholder: |
1. Open app
2. Select window
3. Press hotkey
validations:
required: true
- type: textarea
id: expected
attributes:
label: Expected behavior
validations:
required: true
- type: textarea
id: actual
attributes:
label: Actual behavior
validations:
required: true
- type: input
id: version
attributes:
label: App version or commit
- type: input
id: windows
attributes:
label: Windows version
- type: dropdown
id: capture-mode
attributes:
label: Capture mode
options:
- monitor
- window
- unknown
- type: dropdown
id: voice-mode
attributes:
label: Voice mode
options:
- default
- custom-wav
- unknown
- type: textarea
id: logs
attributes:
label: Error text or logs
description: Remove secrets before pasting.
+8
View File
@@ -0,0 +1,8 @@
blank_issues_enabled: false
contact_links:
- name: Support
url: mailto:support@scriptriva.com
about: Contact Scriptriva Inc. for support inquiries.
- name: Security vulnerability
url: mailto:support@scriptriva.com
about: Please report security issues privately.
@@ -0,0 +1,28 @@
name: Feature request
description: Suggest a focused improvement
title: "[Feature]: "
labels: ["enhancement", "needs-triage"]
body:
- type: textarea
id: problem
attributes:
label: Problem
description: What user problem does this solve?
validations:
required: true
- type: textarea
id: proposal
attributes:
label: Proposed solution
description: What should change?
validations:
required: true
- type: textarea
id: alternatives
attributes:
label: Alternatives considered
- type: textarea
id: scope
attributes:
label: Scope and risks
description: What files or behavior might be affected?
+16
View File
@@ -0,0 +1,16 @@
## Summary
Describe what changed and why.
## Testing
- [ ] `python -m pytest -q`
- [ ] GUI smoke tested, if UI behavior changed
- [ ] Build tested, if packaging changed
## Checklist
- [ ] No secrets, voice samples, local configs, or generated binaries committed
- [ ] Documentation updated if behavior changed
- [ ] Third-party notices preserved
- [ ] Change is scoped and maintainable
+11
View File
@@ -0,0 +1,11 @@
version: 2
updates:
- package-ecosystem: "pip"
directory: "/"
schedule:
interval: "weekly"
open-pull-requests-limit: 5
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "weekly"
+36
View File
@@ -0,0 +1,36 @@
name: CI
on:
pull_request:
push:
branches: ["main"]
permissions:
contents: read
jobs:
tests:
name: Tests
runs-on: windows-latest
strategy:
fail-fast: false
matrix:
python-version: ["3.10", "3.13"]
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
cache: pip
- name: Install
run: |
python -m pip install --upgrade pip
python -m pip install -e ".[test]"
- name: Compile
run: python -m compileall src tests scripts\pyinstaller_entry.py
- name: Test
env:
PYTHONPATH: src
run: python -m pytest -q
+13
View File
@@ -0,0 +1,13 @@
build/*
build/launcher/*
dist/*
scripts/build_output.txt
*.egg-info/
__pycache__/*
.pytest_cache/*
.venv/
*.pyc
tts_output.wav
*.log
src/seshat_tts/__pycache__/*
api_key.txt
+23
View File
@@ -0,0 +1,23 @@
# Code of Conduct
Seshat TTS follows the [Contributor Covenant Code of Conduct](https://www.contributor-covenant.org/version/2/1/code_of_conduct/).
## Expected Behavior
- Be respectful, direct, and constructive.
- Assume good intent while still holding technical claims to evidence.
- Keep discussion focused on the project and its users.
- Report security or abuse issues through the security process instead of public threads.
## Unacceptable Behavior
- Harassment, threats, discriminatory language, or targeted personal attacks.
- Publishing private information or credentials.
- Using the project to support abuse, fraud, malware, unauthorized surveillance, or unauthorized voice cloning.
- Repeatedly derailing technical discussion after maintainers ask to refocus.
## Enforcement
Scriptriva Inc. maintainers may moderate, edit, hide, lock, or remove issues, comments, pull requests, and releases that violate this code. Serious or repeated violations may result in a ban from project spaces.
For conduct concerns, contact support@scriptriva.com.
+39
View File
@@ -0,0 +1,39 @@
# Contributing
Thank you for helping improve Seshat TTS. This project is maintained by Scriptriva Inc. and welcomes focused community contributions.
## Development Setup
```powershell
python -m venv .venv
.\.venv\Scripts\Activate.ps1
python -m pip install -e ".[test]"
$env:PYTHONPATH='src'
python -m pytest -q
```
## Contribution Areas
- OCR accuracy and preprocessing.
- Window capture reliability.
- TTS stream cancellation and playback.
- Local OpenAI-compatible LLM cleanup.
- Packaging and documentation.
- Accessibility and usability.
## Pull Request Expectations
- Keep changes scoped and explain user-visible behavior.
- Add or update tests for behavior changes.
- Do not commit secrets, voice samples, generated voice caches, build outputs, or local config files.
- Preserve third-party notices and license files.
- Follow existing code style and avoid unrelated refactors.
- Run `python -m pytest -q` before opening a pull request.
## Licensing
By contributing, you agree that your contribution may be used under the project license and under any future license chosen by Scriptriva Inc. for this project. You also confirm that you have the right to submit the contribution.
## Security
Do not open public issues for vulnerabilities. Follow [SECURITY.md](SECURITY.md).
+30
View File
@@ -0,0 +1,30 @@
# Governance
Seshat TTS is maintained by Scriptriva Inc.
## Maintainer Authority
Scriptriva Inc. maintainers decide:
- project roadmap and priorities
- releases and distribution
- licensing and branding
- security response
- pull request acceptance
- issue moderation
## Community Contributions
Community contributions are welcome when they align with the project goals and license. Maintainers may request changes, defer work, close inactive issues, or reject changes that increase maintenance risk.
## Decision Principles
- User safety and privacy come first.
- Runtime reliability matters more than novelty.
- Captured text, voice files, API keys, and generated caches are sensitive user data.
- Third-party licenses and notices must remain clear.
- UI changes should keep the app understandable for non-developer users.
## Maintainer Contact
For support, licensing, governance, or security routing: support@scriptriva.com
+286
View File
@@ -0,0 +1,286 @@
Scriptriva Public Source License 1.0
Copyright (c) 2026 Scriptriva Inc.
This license may be reused for other projects by replacing the copyright owner,
year, project metadata, support contact, and any project-specific notices. The
terms below use universal language so the license can govern any software,
documentation, assets, build scripts, or related materials distributed under it.
This is a public source license. It allows commercial use subject to the terms
below. It is not the MIT License, and it is not an Open Source Initiative
approved open source license because it contains attribution, trademark,
distribution, safety, and same-functionality product restrictions.
1. Definitions
"Licensor" means the person or entity identified in the copyright notice for the
Licensed Work.
"Licensed Work" means the source code, object code, documentation, graphical
interface, build scripts, first-party assets, examples, configuration, and other
materials distributed by Licensor under this license, except for third-party
materials governed by separate licenses or notices.
"You" means any individual, company, organization, or other legal entity that
accesses, uses, copies, modifies, distributes, links to, embeds, hosts, or
otherwise interacts with the Licensed Work.
"Source Form" means the preferred form of the Licensed Work for reading,
maintaining, or making modifications.
"Object Form" means any compiled, packaged, bundled, generated, minified,
containerized, hosted, or machine-executable form of the Licensed Work.
"Modification" means any change, translation, adaptation, refactor, port,
extension, integration, plugin, compilation, packaging, or derivative work based
on the Licensed Work.
"Public Project" means any repository, package, release, website, distribution,
service, application, marketplace listing, publication, or other disclosure made
available outside your own household or internal organization.
"Commercial Use" means any use intended for, connected to, or resulting in
direct or indirect monetary value, including selling, licensing, hosting,
offering paid support, bundling with a paid product, including in a commercial
service, using in revenue-generating operations, using to provide professional
services, using in a product trial or freemium funnel, or using to gain a
commercial advantage.
"Same-Functionality Product" means any software, service, product, script,
workflow, plugin, system, model, dataset, or hosted offering that reproduces,
substitutes for, markets against, or provides substantially similar core
functionality to the Licensed Work. The analysis depends on the purpose and
functionality of the original project, not only on implementation details.
"First-Party Marks and Assets" means names, trademarks, service marks, logos,
icons, mascots, banners, artwork, screenshots, trade dress, product identities,
and branding owned or supplied by Licensor.
2. Grant of Rights
Subject to all terms of this license, Licensor grants you a worldwide,
royalty-free, non-exclusive, non-transferable, non-sublicensable, revocable
license to:
a. view, read, inspect, and study the Licensed Work;
b. run the Licensed Work for personal, internal, academic, research, evaluation,
accessibility, interoperability, and Commercial Use;
c. copy the Licensed Work for backup, archival, testing, internal deployment,
permitted Commercial Use, and permitted distribution;
d. create Modifications for personal, internal, academic, research, evaluation,
accessibility, interoperability, and Commercial Use;
e. publish patches, bug reports, interoperability notes, benchmarks, security
findings, documentation, and compatibility layers related to the Licensed Work;
f. distribute the unmodified Licensed Work or Modifications when the distribution
complies with this license, including the attribution, notice, branding,
third-party, and Same-Functionality Product restrictions; and
g. link to, call, embed, automate, or interoperate with the Licensed Work when
your use complies with this license.
No rights are granted except those expressly stated in this license.
3. Attribution and Notice Requirements
If you copy, modify, distribute, link to, embed, reference, host, or publish any
portion of the Licensed Work in Source Form or Object Form, you must:
a. preserve all copyright, license, attribution, provenance, and third-party
notice files;
b. include a clear notice identifying the Licensed Work, the Licensor, the
copyright notice, and the Scriptriva Public Source License 1.0;
c. identify material Modifications you made;
d. provide a copy of this license with the relevant copy, package, repository,
release, service, documentation, or distribution;
e. avoid implying that Licensor sponsors, endorses, maintains, certifies, or is
affiliated with your project unless Licensor gives written permission; and
f. preserve third-party notices and comply with third-party licenses for
components not owned by Licensor.
For graphical applications, documentation sites, public demos, app stores,
release pages, hosted services, and packaged distributions, attribution must be
reasonably visible in at least one of: an About window, license page, notices
file, README, package metadata, documentation page, legal page, or equivalent
user-accessible location.
4. Commercial Use
Commercial Use is allowed under this license when it complies with all terms of
this license. You do not need a separate commercial license merely to use the
Licensed Work commercially, sell services that use the Licensed Work internally,
or include the Licensed Work in a larger commercial system, provided that your
use does not violate Sections 5, 6, 7, or any third-party license.
5. Same-Functionality Product Restrictions
You may not, without prior written permission from Licensor:
a. sell, rent, lease, sublicense, monetize, host, or otherwise charge for access
to the Licensed Work or a Modification as a standalone Same-Functionality
Product;
b. use the Licensed Work or a Modification to create, operate, improve, train,
benchmark, market, or distribute a Same-Functionality Product;
c. rebrand the Licensed Work or a Modification as a competing product;
d. offer the Licensed Work or a Modification as a hosted service, cloud service,
managed service, paid API, subscription product, marketplace product, paid
plugin, paid desktop application, or paid utility whose primary purpose is
substantially similar to the Licensed Work; or
e. use the Licensed Work to train, fine-tune, evaluate, benchmark, or improve a
model, dataset, agent, or automation intended to compete with or replicate the
core functionality of the Licensed Work.
This section does not prohibit ordinary Commercial Use permitted by Section 4,
interoperability, independent implementations that do not use the Licensed Work,
or public criticism, benchmarking, security review, or compatibility testing.
6. Public Projects, Linking, and Modifications
You may reference, link to, embed, or interoperate with the Licensed Work in a
Public Project, including a commercial Public Project, only if:
a. the Public Project does not violate the Same-Functionality Product
restrictions in Section 5;
b. the Public Project includes the attribution required by Section 3;
c. your use does not make First-Party Marks and Assets appear to be your own;
d. your repository, package, documentation, or notice file clearly identifies
which portions are yours and which portions are derived from, include, or
interface with the Licensed Work; and
e. any Modification of the Licensed Work remains under this license unless
Licensor gives written permission for another license.
7. Restrictions
You may not:
a. remove, obscure, or alter copyright, license, trademark, attribution,
provenance, or third-party notice files;
b. use First-Party Marks and Assets in a way that suggests endorsement,
affiliation, certification, or ownership by you;
c. redistribute First-Party Marks and Assets separately from the Licensed Work
unless Licensor gives written permission;
d. use the Licensed Work to violate law, privacy rights, publicity rights, voice
rights, copyright, platform terms, or anti-circumvention rules;
e. use the Licensed Work to clone, synthesize, imitate, capture, extract, or
process a person's voice, likeness, private data, credentials, or protected
content without appropriate rights or permission;
f. use the Licensed Work in surveillance, credential theft, fraud, malware,
spyware, covert data extraction, or deceptive automation workflows;
g. bypass, disable, remove, or misrepresent license notices, support notices,
security notices, safety checks, provenance indicators, or attribution flows;
h. grant sublicenses or downstream rights broader than the rights granted to you
under this license; or
i. use the Licensed Work in a manner that creates legal obligations for Licensor
or requires Licensor to license First-Party Marks and Assets, patents, trade
secrets, private data, or third-party materials.
8. Contributions
Unless Licensor signs a separate written agreement with you, any patch, pull
request, issue suggestion, documentation change, bug fix, code sample, asset,
or other contribution you submit to Licensor may be used, modified, relicensed,
sublicensed, distributed, and commercialized by Licensor without restriction or
payment to you. You represent that you have the right to submit the contribution.
9. Trademarks and Branding
This license does not grant trademark rights. Licensor retains all rights in
First-Party Marks and Assets. You may use the project name only as necessary for
accurate attribution, compatibility statements, or references to the unmodified
Licensed Work.
10. Third-Party Components
Third-party components, dependencies, binaries, models, voices, data files,
fonts, media, and tools are governed by their own licenses and terms. This
license does not change, replace, expand, or restrict rights you may have under
those third-party licenses. If a third-party license gives you broader rights
for that third-party component, those broader rights apply only to that
third-party component and not to the Licensed Work owned by Licensor.
11. Additional Permissions
Separate permissions, commercial redistribution permissions,
Same-Functionality Product permissions, branding permissions, trademark
permissions, and other exceptions may be available from Licensor. For this
project, support and licensing inquiries may be sent to support@scriptriva.com.
12. Termination
Your rights under this license terminate automatically if you violate any term
of this license. Upon termination, you must stop using, copying, modifying,
distributing, hosting, linking to, or otherwise exploiting the Licensed Work and
destroy or delete all copies under your control, except copies you are legally
required to retain for archival compliance. Sections intended to survive,
including attribution, restrictions, third-party notices, trademarks,
disclaimers, limitation of liability, and governing terms, survive termination.
Licensor may reinstate your rights in writing or by accepting a cure, but
reinstatement is not automatic unless required by applicable law.
13. No Warranty
THE LICENSED WORK IS PROVIDED "AS IS" AND "AS AVAILABLE", WITHOUT WARRANTY OF
ANY KIND, EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, INCLUDING WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE, NON-INFRINGEMENT,
SECURITY, ACCURACY, AVAILABILITY, OR ERROR-FREE OPERATION.
14. Limitation of Liability
TO THE MAXIMUM EXTENT PERMITTED BY LAW, LICENSOR AND ITS OFFICERS, EMPLOYEES,
CONTRACTORS, CONTRIBUTORS, AND LICENSORS WILL NOT BE LIABLE FOR ANY INDIRECT,
INCIDENTAL, SPECIAL, CONSEQUENTIAL, EXEMPLARY, PUNITIVE, OR BUSINESS-
INTERRUPTION DAMAGES; LOST PROFITS; LOST REVENUE; LOST DATA; LOSS OF GOODWILL;
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; OR CLAIMS ARISING FROM USE,
INABILITY TO USE, MODIFICATION, DISTRIBUTION, HOSTING, OR RELIANCE ON THE
LICENSED WORK, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
15. Compliance With Law
You are responsible for complying with applicable laws, regulations, platform
terms, export controls, privacy rules, accessibility rules, and intellectual
property rights when using the Licensed Work.
16. License Versions
This license is version 1.0. Scriptriva Inc. may publish new versions of this
license. A file explicitly licensed under this version remains under this
version unless Licensor states otherwise in writing.
17. Severability
If any provision of this license is held unenforceable, the remaining provisions
remain in effect. The unenforceable provision will be interpreted as closely as
possible to its original purpose while remaining enforceable.
18. Entire License
This license is the complete agreement for the rights granted here. Any waiver,
exception, special permission, or alternate license must be in writing from
Licensor.
+274
View File
@@ -0,0 +1,274 @@
# Seshat TTS
![Scriptriva Inc.](resources/banner.jpg)
<p align="center">
<img src="resources/logo.png" alt="Scriptriva Inc. logo" width="160">
</p>
[![Python 3.10-3.14](https://img.shields.io/badge/python-3.10--3.14-3776ab?logo=python&logoColor=white)](https://www.python.org/)
[![Windows](https://img.shields.io/badge/platform-Windows-0078d4?logo=windows&logoColor=white)](https://www.microsoft.com/windows)
[![License: SPSL 1.0](https://img.shields.io/badge/license-SPSL%201.0-green.svg)](LICENSE)
[![Pocket TTS](https://img.shields.io/badge/voice-Kyutai%20Pocket%20TTS-111827)](https://github.com/kyutai-labs/pocket-tts)
[![Tesseract OCR](https://img.shields.io/badge/OCR-Tesseract-4b5563)](https://github.com/tesseract-ocr/tesseract)
Seshat TTS is a Windows GUI utility that reads on-screen text aloud in real time for games or apps. Pick a monitor or window, drag one capture region over the text, press one hotkey, and the selected text is extracted with Tesseract OCR or a local vision LLM, then streamed through Kyutai Pocket TTS.
Maintained by Scriptriva Inc.
For support inquiries, email support@scriptriva.com.
<p align="center">
<img src="resources/anime_meme.gif" alt="Anime girl studying">
</p>
## What It Does
- Captures one selected screen region from a monitor or a chosen window.
- Runs Tesseract OCR on that exact region, or sends the region image directly to a local vision-capable LLM for text extraction.
- Streams the extracted text through Pocket TTS in realtime.
- Lets you use a built-in Pocket TTS voice for speed or upload a custom WAV/MP3 reference voice.
- Optionally routes OCR text through a local OpenAI-compatible LLM endpoint before speech.
- Includes a 0-300% playback volume slider for quiet voices or noisy games.
- Stops any active audio stream when a new read starts, so repeated hotkey presses do not overlap.
- Caches custom voice state as `.safetensors` for faster repeat custom-voice reads when using the `uvx-server` backend.
<p align="center">
<img src="resources/anime_yapping.gif" alt="Fast yapping mode">
</p>
## Requirements
- Windows 10/11.
- Python 3.10 through 3.14 when running from source or building.
- Tesseract OCR for Windows when running from source or building a portable EXE with bundled OCR.
- `uvx` when running from source, or when building a portable EXE with bundled uvx.
- A working audio output device.
Install Tesseract:
```powershell
winget install UB-Mannheim.TesseractOCR
```
Install `uvx`:
```powershell
powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex"
```
Install Seshat TTS for development or for the fast launcher:
```powershell
python -m venv .venv
.\.venv\Scripts\Activate.ps1
python -m pip install -e ".[test]"
```
## Build Before Use
For a single-file portable EXE, build with:
```powershell
.\scripts\build_exe.ps1
```
Portable output:
```powershell
.\dist\seshat-tts.exe
```
That EXE bundles the Seshat GUI/runtime files, app resources, `uvx.exe` if it is available on the build machine, and Tesseract OCR files if Tesseract is installed at `C:\Program Files\Tesseract-OCR`. You can override the OCR bundle source before building:
```powershell
$env:SESHAT_TESSERACT_DIR='D:\Tools\Tesseract-OCR'
.\scripts\build_exe.ps1
```
For the old one-folder PyInstaller build:
```powershell
.\scripts\build_exe.ps1 -OneDir
```
One-folder output:
```text
dist\seshat-tts\seshat-tts.exe
```
The portable EXE still uses Pocket TTS through `uvx-server`. It does not freeze Torch/Pocket TTS inside the EXE because that path has been unreliable on Windows and can trigger native DLL initialization failures. First Pocket TTS use can still download/cache the Pocket TTS tool and model data under the user's normal cache directories, but no separate Python, Tesseract, or uvx install should be needed when those files were bundled during build.
For a tiny development launcher, build:
```powershell
.\scripts\build_launcher_exe.ps1
```
Launcher output:
```text
dist\launcher\seshat-tts.exe
```
This launcher is intentionally small and quick to build. It uses the `.venv` in this project when present, so keep the virtual environment and installed dependencies beside the launcher.
## Run From Source
```powershell
seshat-tts
```
For the fast launcher EXE, run:
```powershell
.\dist\launcher\seshat-tts.exe
```
The launcher expects dependencies in `.venv` or your active Python environment. It does not bundle Python, Torch, Pocket TTS, or Tesseract.
## First-Time Setup
1. Open Seshat TTS.
2. Choose `monitor` or `window` capture mode.
3. Select the monitor or window to watch.
4. Click `Select Region`, then drag over the exact text area to read.
5. Click inside `Read Hotkey` and press the key combo you want. The default is `ctrl+alt+n`.
6. Click inside `Region Hotkey` and press the key combo you want. The default is `ctrl+alt+r`.
7. Click inside `Stop Hotkey` and press the key combo you want. The default is `ctrl+alt+s`.
8. Set `Tesseract` if it was not detected automatically.
9. Choose a voice:
- `default` is fastest and uses a built-in Pocket TTS voice.
- `custom-wav` lets you choose a named WAV, MP3, or cached `.safetensors` reference voice.
10. Adjust `Volume` if the generated voice is too quiet. `100%` is neutral; values above `100%` boost the output, which is clipped safely.
11. Enable `Local LLM` if you want OCR text cleaned by a local OpenAI-compatible server before TTS.
12. Enable `Use local LLM vision instead of Tesseract OCR` only when your local model endpoint supports image input and you want the LLM to read the selected region directly.
13. Click `Preload TTS` once before playing if you want the first read to be less delayed.
14. Press the read hotkey whenever the selected text should be spoken, or the stop hotkey whenever playback should stop.
Use borderless/windowed mode for games if exclusive fullscreen capture returns stale or blank frames.
## Local LLM
The `Local LLM` panel can use an OpenAI-compatible endpoint in two ways:
- `Route OCR through local OpenAI-compatible LLM` keeps Tesseract as the text extractor, then asks the local model to clean the parsed text before TTS.
- `Use local LLM vision instead of Tesseract OCR` skips Tesseract and sends the selected region image to the local model as a PNG data URL. This requires a vision-capable OpenAI-compatible model endpoint.
Typical values:
```text
Base URL: http://127.0.0.1:8000/v1
API Key: local key or token
Model: the model name exposed by your local server
```
`Load api_key.txt` fills the API key field from a repo-local `api_key.txt` file if present. Treat that file as a secret and do not commit it. Lower timeout and max token values reduce latency. No network or LLM path can be truly zero-latency, but a local endpoint keeps the delay as short as the model server allows.
`Disable thinking` is enabled by default. It sends common OpenAI-compatible metadata for local reasoning models, including `chat_template_kwargs.enable_thinking=false`, so models that support that switch skip reasoning output and return faster.
## Voice Modes
`default` voice mode is the fastest. Pick a built-in voice such as `alba`, `marius`, `anna`, `vera`, or `george`.
`custom-wav` mode accepts `.wav`, `.mp3`, and cached `.safetensors` voice files. MP3 references are converted once into cached WAV files before Pocket TTS processes them. Use `Manage` beside `Custom Voice` to name voices, save them, and select them from the dropdown.
The first custom-voice run can be slow because Pocket TTS must convert the reference audio into a voice state. Seshat TTS caches that state under:
```text
%USERPROFILE%\.seshat-tts\voices
```
After that cache exists, the `uvx-server` backend sends a reusable local `voice_url` instead of uploading and reprocessing the same audio every time. Named custom voices are stored in:
```text
%USERPROFILE%\.seshat-tts\voice_profiles.json
```
Pocket TTS voice cloning may require Hugging Face access:
1. Request access on [Kyutai's Pocket TTS Hugging Face page](https://huggingface.co/kyutai/pocket-tts).
2. Create a token at [Hugging Face tokens](https://huggingface.co/settings/tokens).
3. Login for `uvx`:
```powershell
uvx hf auth login --force
```
## Build Commands
Fast launcher build, usually under a minute:
```powershell
.\scripts\build_launcher_exe.ps1
```
Output:
```text
dist\launcher\seshat-tts.exe
```
Full dependency-bundled PyInstaller build:
```powershell
.\scripts\build_exe.ps1
```
Output:
```text
dist\seshat-tts.exe
```
Use the fast launcher during development and for local use. Use the portable build when you need to move the app to a machine where Python, Tesseract, and uvx are not installed.
The `python-api` backend is only shown when running from source or the fast launcher. The bundled PyInstaller EXE only exposes `uvx-server`.
## License and Reuse
Seshat TTS is released under the [Scriptriva Public Source License 1.0](LICENSE).
Copyright (c) 2026 Scriptriva Inc.
Commercial use is allowed under the license terms. The license preserves attribution, third-party notices, Scriptriva branding rights, safety restrictions, and restrictions on reusing the licensed work to create or distribute a same-functionality product.
Useful reuse boundaries:
- `src/seshat_tts/capture.py`: monitor/window capture helpers.
- `src/seshat_tts/ocr.py`: OCR preprocessing and text extraction.
- `src/seshat_tts/tts.py`: Pocket TTS server/API playback adapters and stream cancellation.
- `src/seshat_tts/llm.py`: OpenAI-compatible local LLM cleanup step.
- `src/seshat_tts/config.py`: persisted GUI/runtime configuration.
- `src/seshat_tts/region_picker.py`: snipping-tool-style region selection.
Security and privacy considerations for reuse:
- Treat OCR text, API keys, custom voice files, and generated voice caches as user data.
- Do not commit `api_key.txt`, voice samples, `.safetensors` voice caches, or local config files.
- Custom voice cloning should be used only with audio you have permission to use.
- The portable EXE may bundle third-party binaries; keep their notices and license terms intact.
## Third-Party Notices
Seshat TTS uses and/or interfaces with these third-party projects. Each project remains under its own license:
| Component | Purpose | License | Notes |
| --- | --- | --- | --- |
| [Kyutai Pocket TTS](https://github.com/kyutai-labs/pocket-tts) | Local text-to-speech generation and voice cloning | MIT | The Pocket TTS GitHub repository identifies the project as MIT licensed. Model/voice assets may have separate terms; review the linked Hugging Face pages before redistribution. |
| [Tesseract OCR](https://tesseractocr.org/) | OCR engine used to extract text from selected screen regions | Apache License 2.0 | Tesseract is not MIT licensed. Its project site identifies it as Apache 2.0 licensed. |
| [pytesseract](https://github.com/madmaze/pytesseract) | Python wrapper for Tesseract | Apache License 2.0 | Used to invoke the Tesseract executable from Python. |
| [PyInstaller](https://pyinstaller.org/) | Windows executable packaging | GPLv2-or-later with bootloader exception | Used only for building packaged executables. |
| [OpenAI Python SDK](https://github.com/openai/openai-python) | OpenAI-compatible local LLM client | Apache License 2.0 | Used for optional local LLM cleanup through OpenAI-compatible endpoints. |
Packaged builds include [THIRD_PARTY_NOTICES.md](THIRD_PARTY_NOTICES.md), including a link to the [Pocket TTS MIT license](https://github.com/kyutai-labs/pocket-tts/blob/main/LICENSE).
## Tests
```powershell
$env:PYTHONPATH='src'
python -m pytest -q
```
+37
View File
@@ -0,0 +1,37 @@
# Security Policy
## Supported Versions
The `main` branch is the only supported development line until formal releases begin.
## Reporting a Vulnerability
Email support@scriptriva.com with:
- A concise description of the issue.
- Steps to reproduce.
- Affected version or commit.
- Impact and likely abuse path.
- Any proof-of-concept files or screenshots that are safe to share.
Please do not disclose vulnerabilities publicly until maintainers have had time to investigate and coordinate a fix.
## Sensitive Data
Do not commit:
- `api_key.txt`
- Hugging Face or OpenAI-compatible API tokens
- custom voice samples
- cached `.safetensors` voice states
- generated executable artifacts
- local config files
## Security-Relevant Areas
- Screen/window capture.
- OCR text handling.
- Local LLM endpoint configuration.
- Custom voice file handling.
- TTS server startup and subprocess execution.
- Packaged binary contents.
+22
View File
@@ -0,0 +1,22 @@
# Support
For support inquiries, email support@scriptriva.com.
## Before Asking
- Check the README setup steps.
- Confirm Tesseract OCR is available or bundled.
- Confirm `uvx` can start Pocket TTS.
- Try the latest build from `main`.
- Include the exact error text shown in the GUI.
## Useful Details
When reporting a problem, include:
- Windows version.
- App version or commit.
- Whether you use monitor or window capture.
- Whether you use default voice or custom voice.
- Whether Local LLM is enabled.
- Steps to reproduce.
+45
View File
@@ -0,0 +1,45 @@
# Third-Party Notices
Seshat TTS uses and/or interfaces with third-party software. These projects retain their own licenses.
## Kyutai Pocket TTS
Repository: https://github.com/kyutai-labs/pocket-tts
License: MIT
License file: https://github.com/kyutai-labs/pocket-tts/blob/main/LICENSE
Pocket TTS is used for local text-to-speech generation and voice cloning through the `uvx-server` backend. The Pocket TTS software license is MIT. Model weights, hosted files, or voice assets may have separate terms; review the upstream model and hosting pages before redistributing those assets.
## Tesseract OCR
Project site: https://tesseractocr.org/
License: Apache License 2.0
Tesseract OCR is used to extract text from selected screen regions.
## pytesseract
Repository: https://github.com/madmaze/pytesseract
License: Apache License 2.0
pytesseract is used as the Python wrapper around the Tesseract executable.
## PyInstaller
Project site: https://pyinstaller.org/
License: GPLv2-or-later with bootloader exception
PyInstaller is used to build the Windows executable.
## OpenAI Python SDK
Repository: https://github.com/openai/openai-python
License: Apache License 2.0
The OpenAI Python SDK is used to call optional OpenAI-compatible local LLM endpoints.
+33
View File
@@ -0,0 +1,33 @@
# Architecture
Seshat TTS is a Windows desktop utility that converts selected on-screen text into streamed speech.
## Runtime Flow
1. User selects a monitor or window.
2. User selects a screen region.
3. Hotkey triggers capture.
4. Tesseract OCR extracts text from the selected region, unless LLM image extraction is enabled.
5. Optional local OpenAI-compatible LLM cleans the OCR text, or a vision-capable local LLM extracts text directly from the captured region image.
6. Pocket TTS streams speech.
7. New hotkey captures stop any active stream and start a fresh one.
8. Stop hotkey cancels active playback without starting another capture.
## Key Modules
- `src/seshat_tts/app.py`: Tk GUI, settings flow, hotkey orchestration.
- `src/seshat_tts/capture.py`: monitor/window capture.
- `src/seshat_tts/region_picker.py`: snipping-tool-style region selection.
- `src/seshat_tts/ocr.py`: image preprocessing and Tesseract OCR.
- `src/seshat_tts/tts.py`: Pocket TTS API/server playback.
- `src/seshat_tts/llm.py`: OpenAI-compatible local LLM cleanup and vision-based image text extraction.
- `src/seshat_tts/config.py`: persisted config loading and migration.
- `src/seshat_tts/voices.py`: named custom voice profiles.
## Packaging
`scripts/build_exe.ps1` builds a portable PyInstaller executable using `seshat-tts-portable.spec`.
The packaged EXE includes first-party resources, project license, third-party notices, bundled OCR files when Tesseract is installed on the build machine, and `uvx.exe` when found.
Pocket TTS runs through `uvx-server` in bundled builds to avoid freezing Torch and its native dependencies into the app.
+57
View File
@@ -0,0 +1,57 @@
[project]
name = "seshat-tts"
version = "0.1.0"
description = "Hotkey-triggered screen OCR to Pocket TTS."
requires-python = ">=3.10,<3.15"
readme = "README.md"
license = { file = "LICENSE" }
authors = [
    { name = "Scriptriva Inc.", email = "support@scriptriva.com" },
]
keywords = ["ocr", "tts", "accessibility", "screen-capture", "pocket-tts"]
classifiers = [
    "Development Status :: 3 - Alpha",
    "Environment :: Win32 (MS Windows)",
    "Intended Audience :: End Users/Desktop",
    "Operating System :: Microsoft :: Windows",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.13",
    "Topic :: Multimedia :: Sound/Audio :: Speech",
]
dependencies = [
    "keyboard>=0.13.5",
    "mss>=9.0.1",
    "pillow>=10.0.0",
    "pytesseract>=0.3.13",
    "openai>=1.0.0",
    "imageio-ffmpeg>=0.5.1",
    "requests>=2.20.0",
    "sounddevice>=0.4.6",
    "pocket-tts>=2.1.0",
    # src/seshat_tts/tts.py imports numpy directly, so declare it instead of
    # relying on it arriving transitively through another package.
    # NOTE(review): the lower bound is a conservative guess - confirm.
    "numpy>=1.23.0",
    "pywin32>=306; platform_system == 'Windows'",
]
[project.optional-dependencies]
test = [
"pytest>=8.0.0",
]
build = [
"pyinstaller>=6.0.0",
]
[project.scripts]
seshat-tts = "seshat_tts.app:main"
[project.urls]
Homepage = "https://github.com/Scriptriva/seshat-tts"
Repository = "https://github.com/Scriptriva/seshat-tts"
Issues = "https://github.com/Scriptriva/seshat-tts/issues"
Support = "mailto:support@scriptriva.com"
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[tool.hatch.build.targets.wheel]
packages = ["src/seshat_tts"]
Binary file not shown.

After

Width:  |  Height:  |  Size: 963 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 248 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 32 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.7 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 18 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 184 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.6 MiB

+55
View File
@@ -0,0 +1,55 @@
param(
    # Skip upgrading pip and installing the project with its build/test extras.
    [switch]$SkipInstall,
    # Build from seshat-tts.spec (one-directory layout) instead of the portable spec.
    [switch]$OneDir
)

$ErrorActionPreference = "Stop"

# Throws when the most recent native command exited non-zero.
# $ErrorActionPreference = "Stop" does not cover native exit codes, so without this
# check a failed pip or PyInstaller run would still be followed by a "Built" message.
function Assert-LastExitCode {
    param([string]$Step)
    if ($LASTEXITCODE -ne 0) {
        throw "$Step failed with exit code $LASTEXITCODE."
    }
}

$Root = Split-Path -Parent $PSScriptRoot
Set-Location $Root

$DistApp = Join-Path $Root "dist\seshat-tts"
$DistExe = Join-Path $Root "dist\seshat-tts.exe"

# A running copy keeps the output files locked, so stop it before rebuilding.
$RunningApps = @(Get-Process -Name "seshat-tts" -ErrorAction SilentlyContinue)
foreach ($Process in $RunningApps) {
    try {
        Stop-Process -Id $Process.Id -Force -ErrorAction Stop
    } catch {
        # Best-effort fallback; the check below reports anything still running.
        & taskkill.exe /PID $Process.Id /T /F | Out-Null
    }
}

Start-Sleep -Milliseconds 500
$StillRunning = @(Get-Process -Name "seshat-tts" -ErrorAction SilentlyContinue)
if ($StillRunning.Count -gt 0) {
    $Ids = ($StillRunning | ForEach-Object { $_.Id }) -join ", "
    throw "Close Seshat TTS before building. Could not stop running process id(s): $Ids"
}

# Remove previous outputs, retrying up to five times before giving up.
foreach ($Target in @($DistApp, $DistExe)) {
    if (Test-Path $Target) {
        for ($Attempt = 1; $Attempt -le 5; $Attempt++) {
            try {
                Remove-Item -LiteralPath $Target -Recurse -Force
                break
            } catch {
                if ($Attempt -eq 5) {
                    throw
                }
                Start-Sleep -Seconds 2
            }
        }
    }
}

if (-not $SkipInstall) {
    python -m pip install --upgrade pip
    Assert-LastExitCode "pip upgrade"
    python -m pip install -e ".[build,test]"
    Assert-LastExitCode "Project install"
}

if ($OneDir) {
    python -m PyInstaller --clean --noconfirm seshat-tts.spec
    Assert-LastExitCode "PyInstaller (seshat-tts.spec)"
    Write-Host "Built: $Root\dist\seshat-tts\seshat-tts.exe"
} else {
    python -m PyInstaller --clean --noconfirm seshat-tts-portable.spec
    Assert-LastExitCode "PyInstaller (seshat-tts-portable.spec)"
    Write-Host "Built portable EXE: $Root\dist\seshat-tts.exe"
    # Only the portable spec bundles Tesseract and uvx, so only claim it for that build.
    Write-Host "The portable EXE bundles the GUI runtime, bundled OCR files when Tesseract is installed on this build machine, and uvx.exe when found."
}
+111
View File
@@ -0,0 +1,111 @@
# Builds a small native launcher EXE that starts `python -m seshat_tts` from the
# repository's .venv (or the Python on PATH) with PYTHONPATH pointing at src.
$ErrorActionPreference = "Stop"

$Root = Split-Path -Parent $PSScriptRoot
$OutDir = Join-Path $Root "dist\launcher"
$OutExe = Join-Path $OutDir "seshat-tts.exe"
$BuildDir = Join-Path $Root "build\launcher"
$Source = Join-Path $BuildDir "SeshatTtsLauncher.cs"
$Project = Join-Path $BuildDir "SeshatTtsLauncher.csproj"
$Icon = Join-Path $Root "resources\seshat-tts.ico"
$BuildIcon = Join-Path $BuildDir "seshat-tts.ico"

New-Item -ItemType Directory -Force -Path $OutDir | Out-Null
New-Item -ItemType Directory -Force -Path $BuildDir | Out-Null

# Start from a clean build directory so stale generated sources are never compiled.
Get-ChildItem -Path (Join-Path $BuildDir "*") -File -Include "*.cs", "*.csproj" | Remove-Item -Force
foreach ($GeneratedDir in @("bin", "obj")) {
    $Path = Join-Path $BuildDir $GeneratedDir
    if (Test-Path $Path) {
        Remove-Item -Path $Path -Recurse -Force
    }
}

# The icon is optional. Drop any copy left by an earlier build when it is gone,
# so the project below does not pick up a stale icon.
if (Test-Path $Icon) {
    Copy-Item -Path $Icon -Destination $BuildIcon -Force
} elseif (Test-Path $BuildIcon) {
    Remove-Item -Path $BuildIcon -Force
}

@'
using System;
using System.Diagnostics;
using System.IO;
using System.Text;

public static class SeshatTtsLauncher
{
    public static int Main(string[] args)
    {
        // The launcher is written to dist\launcher, so the repository root is two levels up.
        string exeDir = AppDomain.CurrentDomain.BaseDirectory;
        string root = Path.GetFullPath(Path.Combine(exeDir, "..", ".."));
        string venvPythonw = Path.Combine(root, ".venv", "Scripts", "pythonw.exe");
        string venvPython = Path.Combine(root, ".venv", "Scripts", "python.exe");
        string python = File.Exists(venvPythonw) ? venvPythonw :
            File.Exists(venvPython) ? venvPython :
            "pythonw.exe";

        string arguments = "-m seshat_tts";
        if (args.Length > 0)
        {
            arguments += " " + string.Join(" ", Array.ConvertAll(args, Quote));
        }

        ProcessStartInfo start = new ProcessStartInfo();
        start.FileName = python;
        start.Arguments = arguments;
        start.WorkingDirectory = root;
        start.UseShellExecute = false;
        start.CreateNoWindow = true;
        start.EnvironmentVariables["PYTHONPATH"] = Path.Combine(root, "src");

        try
        {
            Process process = Process.Start(start);
            return process == null ? 1 : 0;
        }
        catch
        {
            // Fall back to the console interpreter when the first choice cannot be started.
            start.FileName = "python.exe";
            start.CreateNoWindow = false;
            Process process = Process.Start(start);
            return process == null ? 1 : 0;
        }
    }

    // Quotes one argument for a Windows command line. Empty arguments are kept as "",
    // and backslashes are doubled only where they precede a quote or the closing quote.
    private static string Quote(string value)
    {
        if (value.Length > 0 && value.IndexOfAny(new char[] {' ', '\t', '"'}) < 0)
        {
            return value;
        }

        StringBuilder quoted = new StringBuilder("\"");
        int backslashes = 0;
        foreach (char c in value)
        {
            if (c == '\\')
            {
                backslashes++;
                continue;
            }
            if (c == '"')
            {
                quoted.Append('\\', backslashes * 2 + 1);
            }
            else
            {
                quoted.Append('\\', backslashes);
            }
            backslashes = 0;
            quoted.Append(c);
        }
        quoted.Append('\\', backslashes * 2);
        quoted.Append('"');
        return quoted.ToString();
    }
}
'@ | Set-Content -Path $Source -Encoding UTF8

$compiler = Get-Command csc.exe -ErrorAction SilentlyContinue
if ($compiler) {
    $cscArgs = @("/nologo", "/target:winexe", "/out:$OutExe")
    if (Test-Path $Icon) {
        $cscArgs += "/win32icon:$Icon"
    }
    $cscArgs += $Source
    & $compiler.Source @cscArgs
    # Native exit codes are not covered by $ErrorActionPreference; check explicitly.
    if ($LASTEXITCODE -ne 0) {
        throw "csc.exe failed with exit code $LASTEXITCODE."
    }
} else {
    $dotnet = Get-Command dotnet.exe -ErrorAction SilentlyContinue
    if (-not $dotnet) {
        throw "No C# compiler found. Install the .NET SDK or add csc.exe to PATH."
    }

    # The icon entries are conditional so the project also builds without resources\seshat-tts.ico.
    @'
<Project Sdk="Microsoft.NET.Sdk">
  <PropertyGroup>
    <OutputType>WinExe</OutputType>
    <TargetFramework>net10.0-windows</TargetFramework>
    <ImplicitUsings>disable</ImplicitUsings>
    <Nullable>disable</Nullable>
    <AssemblyName>seshat-tts</AssemblyName>
    <ApplicationIcon Condition="Exists('$(MSBuildProjectDirectory)\seshat-tts.ico')">seshat-tts.ico</ApplicationIcon>
  </PropertyGroup>
  <ItemGroup>
    <Content Include="seshat-tts.ico" Condition="Exists('$(MSBuildProjectDirectory)\seshat-tts.ico')" />
  </ItemGroup>
</Project>
'@ | Set-Content -Path $Project -Encoding UTF8

    dotnet publish $Project -c Release -o $OutDir --nologo
    if ($LASTEXITCODE -ne 0) {
        throw "dotnet publish failed with exit code $LASTEXITCODE."
    }
}

Write-Host "Built fast launcher: $OutExe"
Write-Host "This launcher expects dependencies installed in .venv or the active Python environment."
+7
View File
@@ -0,0 +1,7 @@
from __future__ import annotations
from seshat_tts.app import main
# Start the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
+94
View File
@@ -0,0 +1,94 @@
# -*- mode: python ; coding: utf-8 -*-
import os
import shutil
from pathlib import Path
# Paths are resolved against the working directory the build is started from.
ROOT = Path.cwd()

datas = []
binaries = []

# Every bundled file is optional: it is added only when present on the build machine.
resources = ROOT / "resources"
if resources.exists():
    datas.append((str(resources), "resources"))

license_file = ROOT / "LICENSE"
if license_file.exists():
    datas.append((str(license_file), "."))

third_party_notices = ROOT / "THIRD_PARTY_NOTICES.md"
if third_party_notices.exists():
    datas.append((str(third_party_notices), "."))

# SESHAT_TESSERACT_DIR overrides the default Tesseract install location.
tesseract_dir = Path(os.environ.get("SESHAT_TESSERACT_DIR", r"C:\Program Files\Tesseract-OCR"))
if tesseract_dir.exists():
    datas.append((str(tesseract_dir), "tesseract"))

for tool_name in ("uvx.exe", "uv.exe"):
    tool = shutil.which(tool_name)
    if tool:
        binaries.append((tool, "tools"))

# Treat the icon like the other optional files: a missing icon falls back to
# PyInstaller's default instead of failing the build.
icon_file = ROOT / "resources" / "seshat-tts.ico"
exe_icon = str(icon_file) if icon_file.exists() else None

a = Analysis(
    ["scripts/pyinstaller_entry.py"],
    pathex=[str(ROOT / "src")],
    binaries=binaries,
    datas=datas,
    hiddenimports=[
        "pytesseract",
        "mss",
        "keyboard",
        "sounddevice",
        "imageio_ffmpeg",
        "openai",
        "win32gui",
        "win32con",
        "win32ui",
    ],
    hookspath=[],
    hooksconfig={},
    runtime_hooks=[],
    # Keep the heavy ML stack and dev-only tooling out of the frozen app.
    excludes=[
        "pocket_tts",
        "torch",
        "torchaudio",
        "torchvision",
        "torchao",
        "xformers",
        "triton",
        "bitsandbytes",
        "pandas",
        "scipy",
        "matplotlib",
        "pyarrow",
        "numba",
        "llvmlite",
        "pytest",
        "IPython",
        "jupyter",
    ],
    noarchive=False,
    optimize=0,
)

pyz = PYZ(a.pure)

# Binaries and datas are passed straight to EXE, producing a single-file executable.
exe = EXE(
    pyz,
    a.scripts,
    a.binaries,
    a.datas,
    [],
    name="seshat-tts",
    debug=False,
    bootloader_ignore_signals=False,
    strip=False,
    upx=True,
    console=False,
    disable_windowed_traceback=False,
    argv_emulation=False,
    target_arch=None,
    codesign_identity=None,
    entitlements_file=None,
    icon=exe_icon,
)
+80
View File
@@ -0,0 +1,80 @@
# -*- mode: python ; coding: utf-8 -*-
from pathlib import Path
# Paths are resolved against the working directory the build is started from.
ROOT = Path.cwd()

# Treat the icon like the other optional files: a missing icon falls back to
# PyInstaller's default instead of failing the build.
icon_file = ROOT / "resources" / "seshat-tts.ico"
exe_icon = str(icon_file) if icon_file.exists() else None

a = Analysis(
    ["scripts/pyinstaller_entry.py"],
    pathex=[str(ROOT / "src")],
    binaries=[],
    # Each data entry is included only when the file or folder exists.
    datas=(
        ([(str(ROOT / "resources"), "resources")] if (ROOT / "resources").exists() else [])
        + ([(str(ROOT / "LICENSE"), ".")] if (ROOT / "LICENSE").exists() else [])
        + ([(str(ROOT / "THIRD_PARTY_NOTICES.md"), ".")] if (ROOT / "THIRD_PARTY_NOTICES.md").exists() else [])
    ),
    hiddenimports=[
        "pytesseract",
        "mss",
        "keyboard",
        "sounddevice",
        "imageio_ffmpeg",
        "openai",
        "win32gui",
        "win32con",
        "win32ui",
    ],
    hookspath=[],
    hooksconfig={},
    runtime_hooks=[],
    # Keep the heavy ML stack and dev-only tooling out of the frozen app.
    excludes=[
        "pocket_tts",
        "torch",
        "torchaudio",
        "torchvision",
        "torchao",
        "xformers",
        "triton",
        "bitsandbytes",
        "pandas",
        "scipy",
        "matplotlib",
        "pyarrow",
        "numba",
        "llvmlite",
        "pytest",
        "IPython",
        "jupyter",
    ],
    noarchive=False,
    optimize=0,
)

pyz = PYZ(a.pure)

# exclude_binaries=True leaves binaries and datas to the COLLECT step below.
exe = EXE(
    pyz,
    a.scripts,
    [],
    exclude_binaries=True,
    name="seshat-tts",
    debug=False,
    bootloader_ignore_signals=False,
    strip=False,
    upx=True,
    console=False,
    disable_windowed_traceback=False,
    argv_emulation=False,
    target_arch=None,
    codesign_identity=None,
    entitlements_file=None,
    icon=exe_icon,
)

coll = COLLECT(
    exe,
    a.binaries,
    a.datas,
    strip=False,
    upx=True,
    upx_exclude=[],
    name="seshat-tts",
)
+5
View File
@@ -0,0 +1,5 @@
"""Hotkey OCR capture to Pocket TTS."""
__all__ = ["__version__"]
__version__ = "0.1.0"
+6
View File
@@ -0,0 +1,6 @@
from .app import main
# Start the app only when the package is executed (``python -m seshat_tts``).
if __name__ == "__main__":
    main()
File diff suppressed because it is too large Load Diff
+116
View File
@@ -0,0 +1,116 @@
from __future__ import annotations
import ctypes
from dataclasses import dataclass
import mss
from PIL import Image
import win32gui
import win32ui
from .config import Rect
@dataclass(frozen=True, slots=True)
class MonitorInfo:
index: int
left: int
top: int
width: int
height: int
@property
def label(self) -> str:
return f"{self.index}: {self.width}x{self.height} at {self.left},{self.top}"
def list_monitors() -> list[MonitorInfo]:
    """Return a ``MonitorInfo`` for every entry of ``mss`` monitors except index 0."""
    found: list[MonitorInfo] = []
    with mss.mss() as sct:
        for position, entry in enumerate(sct.monitors):
            if position == 0:
                # Presumably the combined all-monitors entry - confirm against the mss docs.
                continue
            found.append(
                MonitorInfo(
                    index=position,
                    left=int(entry["left"]),
                    top=int(entry["top"]),
                    width=int(entry["width"]),
                    height=int(entry["height"]),
                )
            )
    return found
def capture_absolute_region(left: int, top: int, width: int, height: int) -> Image.Image:
    """Grab the given screen rectangle with ``mss`` and return it as an RGB image."""
    region = {"left": left, "top": top, "width": width, "height": height}
    with mss.mss() as sct:
        shot = sct.grab(region)
        pixels = Image.frombytes("RGB", shot.size, shot.rgb)
    return pixels
def capture_monitor_region(monitor_index: int, rect: Rect) -> Image.Image:
    """Capture ``rect``, offset by the origin of the chosen monitor.

    Raises:
        ValueError: if ``monitor_index`` is not a valid index above 0.
    """
    with mss.mss() as sct:
        available = len(sct.monitors)
        if not 0 < monitor_index < available:
            raise ValueError(f"Monitor {monitor_index} is not available.")
        chosen = sct.monitors[monitor_index]
        origin_left = int(chosen["left"])
        origin_top = int(chosen["top"])
    # Translate the monitor-relative rectangle into absolute coordinates.
    return capture_absolute_region(
        origin_left + rect.left,
        origin_top + rect.top,
        rect.width,
        rect.height,
    )
def capture_window_region(hwnd: int, rect: Rect) -> Image.Image:
    """Capture the window ``hwnd`` and return the part of it covered by ``rect``.

    ``rect`` is relative to the top-left corner of the captured window image.

    Raises:
        ValueError: if ``rect`` has a negative origin, a non-positive size, or
            extends beyond the captured window image.
    """
    # Reject an obviously invalid rectangle before paying for a window capture.
    if rect.left < 0 or rect.top < 0 or rect.width <= 0 or rect.height <= 0:
        raise ValueError("Capture region must be inside the selected window.")
    image = capture_window(hwnd)
    right = rect.left + rect.width
    bottom = rect.top + rect.height
    if right > image.width or bottom > image.height:
        raise ValueError("Capture region is outside the selected window. Select the region again in window mode.")
    return image.crop((rect.left, rect.top, right, bottom))
def capture_window(hwnd: int) -> Image.Image:
    """Render the window ``hwnd`` into a bitmap via ``PrintWindow`` and return it.

    Raises:
        ValueError: if the window rectangle has no positive width and height.
        RuntimeError: if both ``PrintWindow`` attempts fail.
    """
    left, top, right, bottom = win32gui.GetWindowRect(hwnd)
    width = right - left
    height = bottom - top
    if width <= 0 or height <= 0:
        raise ValueError("Selected window has no capturable size.")

    hwnd_dc = win32gui.GetWindowDC(hwnd)
    source_dc = None
    memory_dc = None
    bitmap = None
    # Every GDI object is created inside the try so that a failure part-way
    # through setup still releases whatever was already acquired.
    try:
        source_dc = win32ui.CreateDCFromHandle(hwnd_dc)
        memory_dc = source_dc.CreateCompatibleDC()
        new_bitmap = win32ui.CreateBitmap()
        new_bitmap.CreateCompatibleBitmap(source_dc, width, height)
        bitmap = new_bitmap
        memory_dc.SelectObject(bitmap)

        # Try flag 2 first (presumably PW_RENDERFULLCONTENT - confirm), then flag 0.
        result = _print_window(hwnd, memory_dc.GetSafeHdc(), 2)
        if result != 1:
            result = _print_window(hwnd, memory_dc.GetSafeHdc(), 0)
        if result != 1:
            raise RuntimeError("PrintWindow failed for the selected window.")

        info = bitmap.GetInfo()
        bits = bitmap.GetBitmapBits(True)
        # copy() detaches the image from the temporary bitmap buffer.
        return Image.frombuffer(
            "RGB",
            (info["bmWidth"], info["bmHeight"]),
            bits,
            "raw",
            "BGRX",
            0,
            1,
        ).copy()
    finally:
        if bitmap is not None:
            win32gui.DeleteObject(bitmap.GetHandle())
        if memory_dc is not None:
            memory_dc.DeleteDC()
        if source_dc is not None:
            source_dc.DeleteDC()
        win32gui.ReleaseDC(hwnd, hwnd_dc)
def _print_window(hwnd: int, hdc: int, flags: int) -> int:
    """Call ``user32.PrintWindow`` and return its result as an ``int``."""
    user32 = ctypes.windll.user32
    outcome = user32.PrintWindow(hwnd, hdc, flags)
    return int(outcome)
+125
View File
@@ -0,0 +1,125 @@
from __future__ import annotations
import json
import sys
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any
from .tesseract import find_tesseract
APP_DIR = Path.home() / ".seshat-tts"
CONFIG_PATH = APP_DIR / "config.json"
@dataclass(slots=True)
class Rect:
    """Rectangle given by its top-left corner and size; the defaults are the initial capture region."""

    left: int = 0
    top: int = 25
    width: int = 720
    height: int = 305
@dataclass(slots=True)
class AppConfig:
    """All persisted application settings together with their default values."""

    # Capture target and hotkeys.
    capture_mode: str = "monitor"
    monitor_index: int = 1
    window_title: str = ""
    hotkey: str = "ctrl+alt+n"
    capture_region_hotkey: str = "ctrl+alt+r"
    stop_hotkey: str = "ctrl+alt+s"
    dialogue_rect: Rect = field(default_factory=Rect)
    # Detected when the config object is created; empty when no Tesseract is found.
    tesseract_cmd: str = field(default_factory=find_tesseract)
    # Voice and TTS settings.
    voice_source: str = "default"
    default_voice: str = "alba"
    custom_voice_name: str = ""
    voice_path: str = ""
    language: str = "english"
    quantize_tts: bool = False
    volume_gain: float = 1.0
    tts_backend: str = "uvx-server"
    tts_host: str = "localhost"
    tts_port: int = 8000
    # Optional local LLM settings.
    llm_enabled: bool = False
    # NOTE(review): defaults to the same port as tts_port (8000) - confirm this is intended.
    llm_base_url: str = "http://127.0.0.1:8000/v1"
    llm_api_key: str = ""
    llm_model: str = "current"
    llm_timeout: float = 5.0
    llm_max_tokens: int = 256
    llm_disable_thinking: bool = True
    llm_image_extraction: bool = False
    llm_system_prompt: str = (
        "Clean OCR text for text-to-speech. Return only the corrected text. "
        "Do not explain, add commentary, summarize, or change the meaning."
    )
    last_text: str = ""
def _rect_from_dict(value: dict[str, Any] | None) -> Rect:
    """Build a ``Rect`` from a saved mapping, using defaults for missing fields.

    A key that is absent or stored as ``null`` keeps the ``Rect`` default;
    every other value is converted with ``int``.
    """
    defaults = Rect()
    if not value:
        return defaults
    coords: dict[str, int] = {}
    # ``name`` rather than ``field`` so the dataclasses import is not shadowed.
    for name in Rect.__dataclass_fields__:
        raw = value.get(name)
        coords[name] = getattr(defaults, name) if raw is None else int(raw)
    return Rect(**coords)
def _clean_last_text(value: Any) -> str:
lines = str(value or "").splitlines()
cleaned = [
line
for line in lines
if not line.strip().casefold().startswith(("capture region:", "text region:"))
]
return "\n".join(cleaned).strip()
def _tesseract_from_config(value: Any) -> str:
    """Pick the Tesseract command to use for a loaded config.

    In a frozen build a detected executable wins over the saved value;
    otherwise the saved value is used, falling back to the detected one.
    """
    detected = find_tesseract()
    running_frozen = getattr(sys, "frozen", False)
    if running_frozen and detected:
        return detected
    configured = value or detected
    return str(configured)
def load_config(path: Path = CONFIG_PATH) -> AppConfig:
    """Load the saved configuration, or return defaults when ``path`` does not exist.

    Missing keys fall back to per-field defaults. Invalid JSON or values that
    cannot be converted raise the underlying ``json``/``int``/``float`` error.
    """
    if not path.exists():
        return AppConfig()
    data = json.loads(path.read_text(encoding="utf-8"))
    # Read fallbacks from the dataclass so they cannot drift from AppConfig
    # (``llm_model`` previously fell back to "unsloth" instead of "current").
    defaults = AppConfig.__dataclass_fields__
    return AppConfig(
        capture_mode=str(data.get("capture_mode", "monitor")),
        monitor_index=int(data.get("monitor_index", 1)),
        window_title=str(data.get("window_title", "")),
        hotkey=str(data.get("hotkey", "ctrl+alt+n")),
        capture_region_hotkey=str(data.get("capture_region_hotkey", "ctrl+alt+r")),
        stop_hotkey=str(data.get("stop_hotkey", "ctrl+alt+s")),
        dialogue_rect=_rect_from_dict(data.get("dialogue_rect")),
        tesseract_cmd=_tesseract_from_config(data.get("tesseract_cmd")),
        voice_source=str(data.get("voice_source", "default")),
        default_voice=str(data.get("default_voice", "alba")),
        custom_voice_name=str(data.get("custom_voice_name", "")),
        voice_path=str(data.get("voice_path", "")),
        # NOTE(review): any saved language is ignored here - confirm this is intended.
        language="english",
        quantize_tts=bool(data.get("quantize_tts", False)),
        volume_gain=float(data.get("volume_gain", 1.0)),
        tts_backend=str(data.get("tts_backend", "uvx-server")),
        tts_host=str(data.get("tts_host", "localhost")),
        tts_port=int(data.get("tts_port", 8000)),
        llm_enabled=bool(data.get("llm_enabled", False)),
        llm_base_url=str(data.get("llm_base_url", "http://127.0.0.1:8000/v1")),
        llm_api_key=str(data.get("llm_api_key", "")),
        llm_model=str(data.get("llm_model", defaults["llm_model"].default)),
        llm_timeout=float(data.get("llm_timeout", 5.0)),
        llm_max_tokens=int(data.get("llm_max_tokens", 256)),
        llm_disable_thinking=bool(data.get("llm_disable_thinking", True)),
        llm_image_extraction=bool(data.get("llm_image_extraction", False)),
        llm_system_prompt=str(
            data.get("llm_system_prompt", defaults["llm_system_prompt"].default)
        ),
        last_text=_clean_last_text(data.get("last_text", "")),
    )
def save_config(config: AppConfig, path: Path = CONFIG_PATH) -> None:
    """Write ``config`` to ``path`` as indented JSON, creating parent folders as needed."""
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    # asdict() serializes every field, so the stored LLM API key is written as plain text.
    payload = json.dumps(asdict(config), indent=2)
    path.write_text(payload, encoding="utf-8")
+30
View File
@@ -0,0 +1,30 @@
from __future__ import annotations
from collections.abc import Callable
import keyboard
class HotkeyManager:
    """Tracks ``keyboard`` hotkey registrations under caller-chosen names."""

    def __init__(self) -> None:
        # Maps a registration name to the handle returned by keyboard.add_hotkey.
        self._handles: dict[str, object] = {}

    def register(self, name: str, hotkey: str, callback: Callable[[], None]) -> None:
        """Replace the hotkey stored under ``name``; a blank ``hotkey`` only removes it."""
        self.unregister(name)
        if hotkey.strip():
            self._handles[name] = keyboard.add_hotkey(hotkey, callback, suppress=False, trigger_on_release=False)

    def unregister(self, name: str | None = None) -> None:
        """Remove the hotkey stored under ``name``, or every hotkey when ``name`` is None."""
        if name is None:
            for handle in self._handles.values():
                keyboard.remove_hotkey(handle)
            self._handles.clear()
            return
        handle = self._handles.pop(name, None)
        if handle is not None:
            keyboard.remove_hotkey(handle)
def listen_for_hotkey() -> str:
    """Return the hotkey string from ``keyboard.read_hotkey`` without suppressing the keys."""
    combo = keyboard.read_hotkey(suppress=False)
    return combo
+143
View File
@@ -0,0 +1,143 @@
from __future__ import annotations
import base64
from io import BytesIO
from pathlib import Path
from typing import Protocol
from PIL import Image
DEFAULT_API_KEY_PATH = Path.home() / ".seshat-tts" / "llm_api_key.txt"
IMAGE_EXTRACTION_SYSTEM_PROMPT = (
"Extract only the visible readable text from the supplied image for text-to-speech. "
"Preserve the original wording and sentence order. Do not describe the image, "
"do not add commentary, and do not include UI labels unless they are part of the text to read."
)
IMAGE_EXTRACTION_USER_PROMPT = "Read the text in this selected screen region and return only that text."
class _ChatCompletions(Protocol):
    """Structural type for an object exposing ``create(**kwargs)``."""

    def create(self, **kwargs: object) -> object: ...


class _Chat(Protocol):
    """Structural type for an object exposing a ``completions`` attribute."""

    completions: _ChatCompletions


class _OpenAIClient(Protocol):
    """Structural type for the client: only ``client.chat.completions.create`` is used."""

    chat: _Chat
def load_api_key_file(path: Path = DEFAULT_API_KEY_PATH) -> str:
    """Return the stripped contents of the key file, or ``""`` when it does not exist."""
    if path.exists():
        contents = path.read_text(encoding="utf-8")
        return contents.strip()
    return ""
def process_text_with_llm(
    text: str,
    *,
    enabled: bool,
    base_url: str,
    api_key: str,
    model: str,
    system_prompt: str,
    timeout: float = 5.0,
    max_tokens: int = 256,
    disable_thinking: bool = True,
    client: _OpenAIClient | None = None,
) -> str:
    """Send ``text`` through an OpenAI-compatible chat endpoint and return the reply.

    Args:
        text: Text to clean; surrounding whitespace is stripped first.
        enabled: When false, the stripped text is returned without any request.
        base_url, api_key, model: Endpoint settings; a blank key becomes ``"local"``.
        system_prompt: System message sent ahead of the text.
        timeout: Client timeout in seconds, clamped to at least 0.1.
        max_tokens: Completion limit, clamped to at least 1.
        disable_thinking: Adds ``extra_body`` switches that ask the server to skip reasoning.
        client: Optional pre-built client; one is created from the settings when omitted.

    Returns:
        The stripped reply, or the stripped input when the reply is empty or
        the response carries no choices.
    """
    text = text.strip()
    if not enabled or not text:
        return text
    if client is None:
        from openai import OpenAI

        client = OpenAI(
            api_key=api_key.strip() or "local",
            base_url=base_url.strip(),
            timeout=max(0.1, float(timeout)),
        )
    request: dict[str, object] = {
        "model": model.strip(),
        "messages": [
            {"role": "system", "content": system_prompt.strip()},
            {"role": "user", "content": text},
        ],
        "temperature": 0,
        "max_tokens": max(1, int(max_tokens)),
        "stream": False,
    }
    if disable_thinking:
        # Several spellings are sent together; presumably different servers honour different keys.
        request["extra_body"] = {
            "chat_template_kwargs": {"enable_thinking": False},
            "enable_thinking": False,
            "reasoning_effort": "none",
        }
    response = client.chat.completions.create(**request)
    # A response without choices gets the same fallback as an empty reply
    # instead of raising IndexError/TypeError.
    choices = getattr(response, "choices", None) or []
    if not choices:
        return text
    content = choices[0].message.content
    return str(content or "").strip() or text
def process_image_with_llm(
    image: Image.Image,
    *,
    base_url: str,
    api_key: str,
    model: str,
    timeout: float = 5.0,
    max_tokens: int = 256,
    disable_thinking: bool = True,
    client: _OpenAIClient | None = None,
) -> str:
    """Ask an OpenAI-compatible chat endpoint to read the text in ``image``.

    Args:
        image: Image sent as a base64 PNG data URL.
        base_url, api_key, model: Endpoint settings; a blank key becomes ``"local"``.
        timeout: Client timeout in seconds, clamped to at least 0.1.
        max_tokens: Completion limit, clamped to at least 1.
        disable_thinking: Adds ``extra_body`` switches that ask the server to skip reasoning.
        client: Optional pre-built client; one is created from the settings when omitted.

    Returns:
        The stripped reply, or ``""`` when the reply is empty or the response
        carries no choices.
    """
    if client is None:
        from openai import OpenAI

        client = OpenAI(
            api_key=api_key.strip() or "local",
            base_url=base_url.strip(),
            timeout=max(0.1, float(timeout)),
        )
    request: dict[str, object] = {
        "model": model.strip(),
        "messages": [
            {"role": "system", "content": IMAGE_EXTRACTION_SYSTEM_PROMPT},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": IMAGE_EXTRACTION_USER_PROMPT},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{_image_to_base64_png(image)}",
                            "detail": "high",
                        },
                    },
                ],
            },
        ],
        "temperature": 0,
        "max_tokens": max(1, int(max_tokens)),
        "stream": False,
    }
    if disable_thinking:
        # Several spellings are sent together; presumably different servers honour different keys.
        request["extra_body"] = {
            "chat_template_kwargs": {"enable_thinking": False},
            "enable_thinking": False,
            "reasoning_effort": "none",
        }
    response = client.chat.completions.create(**request)
    # A response without choices is treated like an empty reply instead of
    # raising IndexError/TypeError.
    choices = getattr(response, "choices", None) or []
    if not choices:
        return ""
    content = choices[0].message.content
    return str(content or "").strip()
def _image_to_base64_png(image: Image.Image) -> str:
buffer = BytesIO()
image.convert("RGB").save(buffer, format="PNG")
return base64.b64encode(buffer.getvalue()).decode("ascii")
+60
View File
@@ -0,0 +1,60 @@
from __future__ import annotations
import os
from pathlib import Path
from PIL import Image, ImageEnhance, ImageFilter, ImageOps
from .tesseract import tesseract_help_message
def preprocess_for_ocr(image: Image.Image) -> Image.Image:
    """Return a padded, doubled, high-contrast black-and-white copy of ``image``.

    Steps: 12 px black border, grayscale, 2x LANCZOS upscale, contrast x2.2,
    sharpen, then threshold at 145.
    """

    def _binarize(pixel: int) -> int:
        return 255 if pixel > 145 else 0

    padded = ImageOps.expand(image, border=12, fill=(0, 0, 0))
    mono = ImageOps.grayscale(padded)
    doubled_size = (mono.width * 2, mono.height * 2)
    upscaled = mono.resize(doubled_size, Image.Resampling.LANCZOS)
    boosted = ImageEnhance.Contrast(upscaled).enhance(2.2)
    crisp = boosted.filter(ImageFilter.SHARPEN)
    return crisp.point(_binarize)
def image_to_lines(image: Image.Image, tesseract_cmd: str = "") -> list[str]:
    """Run Tesseract on ``image`` and return its non-empty, normalized lines.

    When ``tesseract_cmd`` is given it becomes pytesseract's executable, and
    ``TESSDATA_PREFIX`` is set for the whole process if a ``tessdata`` folder
    sits next to it.

    Raises:
        RuntimeError: if pytesseract cannot find the Tesseract executable.
    """
    import pytesseract
    from pytesseract import TesseractNotFoundError

    if tesseract_cmd:
        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
        tessdata = _tessdata_dir(tesseract_cmd)
        if tessdata is not None:
            os.environ["TESSDATA_PREFIX"] = str(tessdata)
    config = "--psm 6 --oem 3"
    try:
        text = pytesseract.image_to_string(image, lang="eng", config=config)
    except TesseractNotFoundError as exc:
        raise RuntimeError(tesseract_help_message()) from exc
    # Normalize each line once, then drop the ones that end up empty.
    normalized = (normalize_line(raw) for raw in text.splitlines())
    return [line for line in normalized if line]
def normalize_line(line: str) -> str:
    """Collapse whitespace runs to single spaces and straighten curly quotes."""
    import re

    line = re.sub(r"\s+", " ", line).strip()
    # Escapes keep the quote characters intact regardless of file encoding;
    # an empty search string here would insert a quote between every character.
    straight_quotes = str.maketrans(
        {
            "\u201c": '"',
            "\u201d": '"',
            "\u2018": "'",
            "\u2019": "'",
        }
    )
    return line.translate(straight_quotes)
def extract_text_from_lines(lines: list[str]) -> str:
    """Join ``lines`` with single spaces and strip the result."""
    joined = " ".join(lines)
    return joined.strip()
def extract_ocr_text(image: Image.Image, tesseract_cmd: str = "") -> str:
    """Preprocess ``image``, run OCR on it, and return the lines joined into one string."""
    prepared = preprocess_for_ocr(image)
    ocr_lines = image_to_lines(prepared, tesseract_cmd)
    return extract_text_from_lines(ocr_lines)
def _tessdata_dir(tesseract_cmd: str) -> Path | None:
if not tesseract_cmd:
return None
tessdata = Path(tesseract_cmd).resolve().parent / "tessdata"
if tessdata.exists():
return tessdata
return None
+100
View File
@@ -0,0 +1,100 @@
from __future__ import annotations
import tkinter as tk
from collections.abc import Callable
from PIL import ImageEnhance, ImageTk
from .capture import capture_absolute_region
from .config import Rect
class RegionPicker(tk.Toplevel):
    """Borderless, topmost overlay on which the user drags out a rectangle.

    The overlay shows a dimmed screenshot of ``bounds``. Releasing the mouse
    closes it and, for selections of at least 4x4, calls ``on_selected`` with a
    ``Rect`` offset by ``bounds.left``/``bounds.top``. Escape closes it without
    calling the callback.
    """

    def __init__(
        self,
        parent: tk.Tk,
        bounds: Rect,
        on_selected: Callable[[Rect], None],
    ) -> None:
        super().__init__(parent)
        self._bounds = bounds
        self._on_selected = on_selected
        self._start_x = 0
        self._start_y = 0
        self._rect_id: int | None = None
        self._label_id: int | None = None

        self.overrideredirect(True)
        self.attributes("-topmost", True)
        placement = f"{bounds.width}x{bounds.height}{bounds.left:+d}{bounds.top:+d}"
        self.geometry(placement)
        self.configure(cursor="crosshair")

        backdrop = capture_absolute_region(bounds.left, bounds.top, bounds.width, bounds.height)
        darkened = ImageEnhance.Brightness(backdrop).enhance(0.55)
        # Stored on self so the PhotoImage stays referenced while the canvas shows it.
        self._image = ImageTk.PhotoImage(darkened)

        self.canvas = tk.Canvas(self, bg="#050608", highlightthickness=0, cursor="crosshair")
        self.canvas.pack(fill=tk.BOTH, expand=True)
        self.canvas.create_image(0, 0, image=self._image, anchor=tk.NW)
        self.canvas.create_text(
            18,
            18,
            text="Drag to select. Esc cancels.",
            fill="#f4f7fb",
            anchor=tk.NW,
            font=("Segoe UI", 12, "bold"),
        )

        mouse_bindings = (
            ("<ButtonPress-1>", self._on_press),
            ("<B1-Motion>", self._on_drag),
            ("<ButtonRelease-1>", self._on_release),
        )
        for sequence, handler in mouse_bindings:
            self.canvas.bind(sequence, handler)
        self.bind("<Escape>", lambda _event: self.destroy())
        self.focus_force()
        self.grab_set()

    def _box_to(self, event: tk.Event) -> tuple[int, int, int, int]:
        """Return ``(x1, y1, x2, y2)`` between the press point and ``event``, sorted per axis."""
        x1, x2 = sorted((self._start_x, int(event.x)))
        y1, y2 = sorted((self._start_y, int(event.y)))
        return x1, y1, x2, y2

    def _on_press(self, event: tk.Event) -> None:
        """Remember the press point and create a zero-size selection rectangle there."""
        self._start_x = int(event.x)
        self._start_y = int(event.y)
        self._rect_id = self.canvas.create_rectangle(
            self._start_x,
            self._start_y,
            self._start_x,
            self._start_y,
            outline="#ff365f",
            width=3,
        )

    def _on_drag(self, event: tk.Event) -> None:
        """Resize the selection rectangle and update its size label."""
        if self._rect_id is None:
            return
        x1, y1, x2, y2 = self._box_to(event)
        self.canvas.coords(self._rect_id, x1, y1, x2, y2)
        label = f"{x2 - x1} x {y2 - y1}"
        label_x = x1 + 8
        label_y = max(12, y1 - 18)
        if self._label_id is None:
            self._label_id = self.canvas.create_text(
                label_x,
                label_y,
                text=label,
                fill="#f4f7fb",
                anchor=tk.W,
                font=("Segoe UI", 10, "bold"),
            )
            return
        self.canvas.coords(self._label_id, label_x, label_y)
        self.canvas.itemconfigure(self._label_id, text=label)

    def _on_release(self, event: tk.Event) -> None:
        """Close the overlay and report the selection unless it is smaller than 4x4."""
        x1, y1, x2, y2 = self._box_to(event)
        self.grab_release()
        self.destroy()
        picked_width = x2 - x1
        picked_height = y2 - y1
        if picked_width < 4 or picked_height < 4:
            return
        self._on_selected(
            Rect(
                left=self._bounds.left + x1,
                top=self._bounds.top + y1,
                width=picked_width,
                height=picked_height,
            )
        )
+9
View File
@@ -0,0 +1,9 @@
from __future__ import annotations
import sys
from pathlib import Path
def resource_path(relative_path: str) -> Path:
    """Resolve ``relative_path`` against ``sys._MEIPASS`` when set, else two levels above this file's folder."""
    source_root = Path(__file__).resolve().parents[2]
    base = Path(getattr(sys, "_MEIPASS", source_root))
    return base / relative_path
+33
View File
@@ -0,0 +1,33 @@
from __future__ import annotations
import shutil
from pathlib import Path
from .resources import resource_path
COMMON_TESSERACT_PATHS = (
Path(r"C:\Program Files\Tesseract-OCR\tesseract.exe"),
Path(r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe"),
)
def find_tesseract() -> str:
    """Return the first Tesseract executable found, or ``""`` when there is none.

    Checked in order: the bundled ``tesseract/tesseract.exe`` resource,
    ``tesseract`` on PATH, then ``COMMON_TESSERACT_PATHS``.
    """
    bundled = resource_path("tesseract/tesseract.exe")
    if bundled.exists():
        return str(bundled)
    on_path = shutil.which("tesseract")
    if on_path:
        return on_path
    return next((str(known) for known in COMMON_TESSERACT_PATHS if known.exists()), "")
def tesseract_help_message() -> str:
    """Return the user-facing hint shown when Tesseract cannot be found."""
    sentences = (
        "Tesseract OCR is not installed or the executable is not configured.",
        "Install it with `winget install UB-Mannheim.TesseractOCR`, then restart the app, "
        "or select tesseract.exe in the GUI.",
    )
    return " ".join(sentences)
+576
View File
@@ -0,0 +1,576 @@
from __future__ import annotations
import collections
import functools
import hashlib
import http.server
import importlib
import os
import queue
import shutil
import subprocess
import threading
import time
from pathlib import Path
from urllib.parse import quote, urljoin
import numpy as np
import requests
import sounddevice as sd
from .resources import resource_path
from .voices import safe_voice_slug
VOICE_CACHE_DIR = Path.home() / ".seshat-tts" / "voices"
class PocketTTSStreamer:
def __init__(
self,
voice_path: str | Path,
language: str = "english",
quantize: bool = False,
voice_source: str = "default",
default_voice: str = "alba",
custom_voice_name: str = "",
volume_gain: float = 1.0,
) -> None:
self.voice_path = str(voice_path)
self.language = language
self.quantize = quantize
self.voice_source = voice_source
self.default_voice = default_voice
self.custom_voice_name = custom_voice_name
self.volume_gain = _clamp_volume_gain(volume_gain)
self._model = None
self._voice_state = None
self._lock = threading.Lock()
self._cancel_lock = threading.Lock()
self._cancel_event = threading.Event()
self._status_queue: queue.Queue[str] = queue.Queue()
@property
def status_queue(self) -> queue.Queue[str]:
return self._status_queue
def speak_async(self, text: str) -> None:
text = text.strip()
if not text:
return
cancel_event = self._begin_new_stream()
threading.Thread(target=self._speak, args=(text, cancel_event), daemon=True).start()
def preload_async(self) -> None:
threading.Thread(target=self._preload, daemon=True).start()
def test_async(self) -> None:
self.speak_async("This is a Pocket TTS test.")
def close(self) -> None:
self.stop()
def stop(self) -> None:
with self._cancel_lock:
self._cancel_event.set()
def _begin_new_stream(self) -> threading.Event:
with self._cancel_lock:
self._cancel_event.set()
self._cancel_event = threading.Event()
return self._cancel_event
def _preload(self) -> None:
with self._lock:
try:
self._load()
except Exception as exc:
self._status_queue.put(f"TTS preload error: {exc}")
def _load(self) -> None:
if self._model is not None and self._voice_state is not None:
return
self._status_queue.put("Loading Pocket TTS model...")
try:
pocket_tts = importlib.import_module("pocket_tts")
tts_model = getattr(pocket_tts, "TTSModel")
except (ImportError, OSError) as exc:
raise RuntimeError(
"Pocket TTS failed to load through the in-process Python API. "
"Use the uvx-server backend, especially from the bundled EXE."
) from exc
try:
self._model = tts_model.load_model(language=self.language, quantize=self.quantize)
except OSError as exc:
raise RuntimeError(
"Pocket TTS/Torch DLL initialization failed in the in-process Python API. "
"Use the uvx-server backend instead."
) from exc
voice = self.default_voice if self.voice_source == "default" else self._custom_voice_path()
self._status_queue.put(f"Loading voice: {voice}")
self._voice_state = self._model.get_state_for_audio_prompt(voice)
self._status_queue.put("Pocket TTS ready.")
def _custom_voice_path(self) -> str:
if not self.voice_path.strip():
raise ValueError("Select a WAV or MP3 file, or change Voice Source to default.")
return str(_prepared_audio_prompt_path(self.voice_path, self.language, self._status_queue))
def _speak(self, text: str, cancel_event: threading.Event) -> None:
with self._lock:
try:
self._load()
if cancel_event.is_set():
self._status_queue.put("Stopped previous TTS stream.")
return
assert self._model is not None
assert self._voice_state is not None
sample_rate = int(self._model.sample_rate)
self._status_queue.put("Speaking OCR text...")
with sd.OutputStream(samplerate=sample_rate, channels=1, dtype="float32") as stream:
for chunk in self._model.generate_audio_stream(self._voice_state, text):
if cancel_event.is_set():
self._status_queue.put("Stopped previous TTS stream.")
return
audio = chunk.detach().cpu().numpy()
audio = np.asarray(audio, dtype=np.float32).reshape(-1)
if audio.size:
stream.write(_apply_volume_gain(audio, self.volume_gain))
self._status_queue.put("Done.")
except Exception as exc:
self._status_queue.put(f"TTS error: {exc}")
class UvxPocketTTSServer:
    """Speak text through a local ``pocket-tts serve`` process started via uvx.

    The server process is started on demand, polled through its ``health``
    endpoint, and asked for audio through its ``tts`` endpoint. The WAV
    response is played while it streams in. Progress and error text is
    published on ``status_queue``.
    """

    def __init__(
        self,
        voice_path: str | Path,
        language: str = "english",
        quantize: bool = False,
        host: str = "localhost",
        port: int = 8000,
        voice_source: str = "default",
        default_voice: str = "alba",
        custom_voice_name: str = "",
        volume_gain: float = 1.0,
    ) -> None:
        """Store the settings; the server is not started until first use.

        Args:
            voice_path: Voice file used when ``voice_source`` is not
                ``"default"``.
            language: Passed to the server as ``--language`` and used for the
                voice cache key.
            quantize: Adds ``--quantize`` to the server command when true.
            host: Host the server binds to and is contacted on.
            port: Port the server binds to and is contacted on.
            voice_source: ``"default"`` selects ``default_voice``; any other
                value selects the file at ``voice_path``.
            default_voice: Voice identifier sent for the default source.
            custom_voice_name: Optional name used as the cache file prefix.
            volume_gain: Playback gain, clamped by ``_clamp_volume_gain``.
        """
        self.voice_path = str(voice_path)
        self.language = language
        self.quantize = quantize
        self.host = host
        self.port = port
        self.voice_source = voice_source
        self.default_voice = default_voice
        self.custom_voice_name = custom_voice_name
        self.volume_gain = _clamp_volume_gain(volume_gain)
        self._process: subprocess.Popen[str] | None = None
        # Serializes server start-up and readiness polling.
        self._lock = threading.Lock()
        # Ensures only one utterance is requested and played at a time.
        self._speak_lock = threading.Lock()
        # Guards the cancel event and the in-flight response handle.
        self._cancel_lock = threading.Lock()
        self._cancel_event = threading.Event()
        self._active_response: requests.Response | None = None
        # Rolling tail of server output, kept for error reports.
        self._server_output: collections.deque[str] = collections.deque(maxlen=80)
        self._status_queue: queue.Queue[str] = queue.Queue()

    @property
    def status_queue(self) -> queue.Queue[str]:
        """Queue of human-readable status and error messages."""
        return self._status_queue

    @property
    def base_url(self) -> str:
        """Root URL of the server, with a trailing slash."""
        return f"http://{self.host}:{self.port}/"

    def preload_async(self) -> None:
        """Start the server on a daemon thread, reporting failures as status."""
        threading.Thread(target=self._preload, daemon=True).start()

    def _preload(self) -> None:
        """Bring the server up, publishing any failure on the status queue.

        Without this wrapper a start-up error would only surface as an
        unhandled exception in the background thread.
        """
        try:
            self._ensure_server()
        except Exception as exc:
            self._status_queue.put(f"TTS preload error: {exc}")

    def speak_async(self, text: str) -> None:
        """Cancel any current utterance and speak ``text`` on a daemon thread.

        Whitespace-only text is ignored and does not cancel anything.
        """
        text = text.strip()
        if not text:
            return
        cancel_event = self._begin_new_stream()
        threading.Thread(target=self._speak, args=(text, cancel_event), daemon=True).start()

    def test_async(self) -> None:
        """Speak a fixed test sentence."""
        self.speak_async("This is a Pocket TTS test.")

    def close(self) -> None:
        """Stop playback and terminate the server process if it is running."""
        self.stop()
        if self._process and self._process.poll() is None:
            self._process.terminate()

    def stop(self) -> None:
        """Cancel the current utterance and close its response, if any."""
        with self._cancel_lock:
            self._cancel_event.set()
            if self._active_response is not None:
                self._active_response.close()

    def _begin_new_stream(self) -> threading.Event:
        """Cancel the previous utterance and return a fresh cancel event."""
        with self._cancel_lock:
            self._cancel_event.set()
            if self._active_response is not None:
                self._active_response.close()
            self._cancel_event = threading.Event()
            return self._cancel_event

    def _is_healthy(self) -> bool:
        """Return whether the ``health`` endpoint answers with a success status."""
        try:
            response = requests.get(urljoin(self.base_url, "health"), timeout=2)
            return response.ok
        except requests.RequestException:
            return False

    def _ensure_server(self) -> None:
        """Make sure the server is up, starting it when necessary.

        Raises:
            FileNotFoundError: Propagated from ``_find_uvx`` when uvx is missing.
            RuntimeError: If the server process exits before becoming healthy.
            TimeoutError: If the server is not healthy within 900 seconds.
        """
        with self._lock:
            if self._is_healthy():
                self._status_queue.put("Pocket TTS server ready.")
                return
            if self._process is None or self._process.poll() is not None:
                uvx = _find_uvx()
                command = [
                    str(uvx),
                    "pocket-tts",
                    "serve",
                    "--host",
                    self.host,
                    "--port",
                    str(self.port),
                    "--language",
                    self.language,
                ]
                if self.quantize:
                    command.append("--quantize")
                self._server_output.clear()
                self._status_queue.put(f"Starting Pocket TTS server with {uvx}...")
                self._process = subprocess.Popen(
                    command,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.STDOUT,
                    text=True,
                    encoding="utf-8",
                    errors="replace",
                    env=_clean_subprocess_env(),
                    cwd=str(Path.home()),
                    creationflags=_subprocess_creationflags(),
                )
                # Drain the pipe in the background and keep a tail for errors.
                threading.Thread(target=self._read_server_output, daemon=True).start()
            deadline = time.monotonic() + 900
            while time.monotonic() < deadline:
                if self._is_healthy():
                    self._status_queue.put("Pocket TTS server ready.")
                    return
                if self._process and self._process.poll() is not None:
                    output = self._server_output_tail()
                    detail = f"\n{output}" if output else " No server output was captured."
                    raise RuntimeError(f"Pocket TTS server exited with code {self._process.returncode}.{detail}")
                time.sleep(1)
            raise TimeoutError("Pocket TTS server did not become ready before timeout.")

    def _read_server_output(self) -> None:
        """Collect non-empty server output lines into the rolling buffer."""
        process = self._process
        if process is None or process.stdout is None:
            return
        try:
            for line in process.stdout:
                line = line.strip()
                if line:
                    self._server_output.append(line)
        except Exception as exc:
            self._server_output.append(f"Failed to read server output: {exc}")

    def _server_output_tail(self) -> str:
        """Return the last 12 captured output lines joined by newlines."""
        if not self._server_output:
            return ""
        return "\n".join(list(self._server_output)[-12:])

    def _speak(self, text: str, cancel_event: threading.Event) -> None:
        """Request audio for ``text`` and play it until done or cancelled.

        Every exception is reported on the status queue instead of propagating,
        because this runs on a background thread.
        """
        with self._speak_lock:
            if cancel_event.is_set():
                self._status_queue.put("Stopped previous TTS stream.")
                return
            response = None
            try:
                self._ensure_server()
                if cancel_event.is_set():
                    self._status_queue.put("Stopped previous TTS stream.")
                    return
                self._status_queue.put("Requesting Pocket TTS audio...")
                voice_url = self.default_voice if self.voice_source == "default" else self._custom_voice_url()
                response = requests.post(
                    urljoin(self.base_url, "tts"),
                    data={"text": text, "voice_url": voice_url},
                    stream=True,
                    timeout=900,
                )
                with self._cancel_lock:
                    self._active_response = response
                response.raise_for_status()
                self._play_streaming_wav(response, cancel_event)
                if not cancel_event.is_set():
                    self._status_queue.put("Done.")
            except requests.RequestException as exc:
                if cancel_event.is_set():
                    self._status_queue.put("Stopped previous TTS stream.")
                else:
                    self._status_queue.put(f"TTS error: {exc}")
            except Exception as exc:
                self._status_queue.put(f"TTS error: {exc}")
            finally:
                with self._cancel_lock:
                    self._active_response = None
                # A streamed response holds its connection until it is closed,
                # so release it on every exit path.
                if response is not None:
                    response.close()

    def _custom_voice_path(self) -> str:
        """Return the configured custom voice file.

        Raises:
            ValueError: If no voice file has been selected.
        """
        if not self.voice_path.strip():
            raise ValueError("Select a WAV or MP3 file, or change Voice Source to default.")
        return self.voice_path

    def _custom_voice_url(self) -> str:
        """Return a local HTTP URL serving the cached state of the custom voice."""
        voice_state = _cached_voice_state_path(
            self._custom_voice_path(),
            self.language,
            self._status_queue,
            self.custom_voice_name,
        )
        return _voice_state_server.url_for(voice_state)

    def _play_streaming_wav(self, response: requests.Response, cancel_event: threading.Event) -> None:
        """Play a WAV HTTP response while it is still being received.

        Bytes are buffered until the header is complete; afterwards only whole
        frames are decoded and written, and any remainder waits for more data.

        Raises:
            ValueError: If the stream is not a usable WAV stream.
        """
        buffer = bytearray()
        stream: sd.OutputStream | None = None
        sample_width = 0
        channels = 0
        try:
            for chunk in response.iter_content(chunk_size=16384):
                if cancel_event.is_set():
                    response.close()
                    self._status_queue.put("Stopped previous TTS stream.")
                    return
                if not chunk:
                    continue
                buffer.extend(chunk)
                if stream is None:
                    header_end = _find_wav_data_offset(buffer)
                    if header_end is None:
                        continue
                    channels, sample_rate, sample_width = _read_wav_format(buffer)
                    # Reject a zero frame size up front; it would otherwise
                    # surface below as an opaque division-by-zero error.
                    if channels <= 0 or sample_width <= 0:
                        raise ValueError("WAV stream has an invalid channel count or sample width.")
                    stream = sd.OutputStream(samplerate=sample_rate, channels=channels, dtype="float32")
                    stream.start()
                    del buffer[:header_end]
                    self._status_queue.put("Streaming Pocket TTS audio...")
                frame_size = sample_width * channels
                usable = len(buffer) - (len(buffer) % frame_size)
                if usable <= 0:
                    continue
                pcm = bytes(buffer[:usable])
                del buffer[:usable]
                audio = _pcm_to_float32(pcm, sample_width, channels)
                if audio.size:
                    stream.write(_apply_volume_gain(audio, self.volume_gain))
        finally:
            if stream is not None:
                stream.stop()
                stream.close()
class _QuietStaticFileHandler(http.server.SimpleHTTPRequestHandler):
    """Static file request handler with per-request logging switched off."""

    def log_message(self, _format: str, *args: object) -> None:
        """Drop the log line; nothing is written anywhere."""
        return None
class _VoiceStateServer:
    """Lazily started HTTP server exposing ``VOICE_CACHE_DIR`` on 127.0.0.1."""

    def __init__(self) -> None:
        """Create the holder; no socket is opened until ``url_for`` is called."""
        self._lock = threading.Lock()
        self._server: http.server.ThreadingHTTPServer | None = None
        self._thread: threading.Thread | None = None

    def url_for(self, path: Path) -> str:
        """Return the URL under which the file named ``path.name`` is served.

        The first call starts the server on an OS-assigned port in a daemon
        thread. Only the file name of ``path`` is used to build the URL.
        """
        with self._lock:
            VOICE_CACHE_DIR.mkdir(parents=True, exist_ok=True)
            if self._server is None:
                serve_cache_dir = functools.partial(_QuietStaticFileHandler, directory=str(VOICE_CACHE_DIR))
                # Port 0 lets the OS pick a free port.
                self._server = http.server.ThreadingHTTPServer(("127.0.0.1", 0), serve_cache_dir)
                worker = threading.Thread(target=self._server.serve_forever, daemon=True)
                self._thread = worker
                worker.start()
            port = self._server.server_address[1]
            return f"http://127.0.0.1:{port}/{quote(path.name)}"
# Module-wide instance; its HTTP server is only started by the first url_for() call.
_voice_state_server = _VoiceStateServer()
def _cached_voice_state_path(
    source_path: str,
    language: str,
    status_queue: queue.Queue[str],
    voice_name: str = "",
) -> Path:
    """Return the cached ``.safetensors`` voice state for a voice file.

    The cache file name combines a slug (from ``voice_name`` when given,
    otherwise the source file stem) with a digest of the resolved source path,
    its modification time, its size and the language. A ``.safetensors``
    source is copied into the cache; any other source is exported with
    ``pocket-tts export-voice`` (MP3 input is converted to WAV first).

    Args:
        source_path: Voice file selected by the user.
        language: Language passed to the export and mixed into the cache key.
        status_queue: Receives progress messages.
        voice_name: Optional display name used as the cache file prefix.

    Returns:
        Path of the cached voice state inside ``VOICE_CACHE_DIR``.

    Raises:
        FileNotFoundError: If ``source_path`` does not exist.
        subprocess.CalledProcessError: If the export command fails.
    """
    source = Path(source_path)
    if not source.exists():
        raise FileNotFoundError(f"Voice file not found: {source}")
    VOICE_CACHE_DIR.mkdir(parents=True, exist_ok=True)
    stat = source.stat()
    digest = hashlib.sha256(
        f"{source.resolve()}|{stat.st_mtime_ns}|{stat.st_size}|{language}".encode("utf-8")
    ).hexdigest()[:24]
    prefix = safe_voice_slug(voice_name) if voice_name.strip() else source.stem
    target = VOICE_CACHE_DIR / f"{safe_voice_slug(prefix)}-{digest}.safetensors"
    if target.exists():
        status_queue.put("Using cached custom voice state.")
        return target
    if source.suffix.casefold() == ".safetensors":
        try:
            shutil.copy2(source, target)
        except BaseException:
            # Never leave a truncated copy behind: it would be taken for a
            # valid cache entry on the next call.
            target.unlink(missing_ok=True)
            raise
        status_queue.put("Using cached custom voice state.")
        return target
    prompt_source = _prepared_audio_prompt_path(source, language, status_queue, digest)
    status_queue.put("Exporting custom voice cache; first run can take a while.")
    command = [
        str(_find_uvx()),
        "pocket-tts",
        "export-voice",
        str(prompt_source),
        str(target),
        "--language",
        language,
        "--quiet",
    ]
    try:
        subprocess.run(command, check=True, env=_clean_subprocess_env(), creationflags=_subprocess_creationflags())
    except BaseException:
        # A failed or interrupted export may have written a partial file;
        # remove it so the next call exports again instead of reusing it.
        target.unlink(missing_ok=True)
        raise
    status_queue.put("Custom voice cache ready.")
    return target
def _prepared_audio_prompt_path(
source_path: str | Path,
language: str,
status_queue: queue.Queue[str],
digest: str | None = None,
) -> Path:
source = Path(source_path)
if source.suffix.casefold() != ".mp3":
return source
VOICE_CACHE_DIR.mkdir(parents=True, exist_ok=True)
if digest is None:
stat = source.stat()
digest = hashlib.sha256(
f"{source.resolve()}|{stat.st_mtime_ns}|{stat.st_size}|{language}".encode("utf-8")
).hexdigest()[:24]
target = VOICE_CACHE_DIR / f"{safe_voice_slug(source.stem)}-{digest}.wav"
if target.exists():
status_queue.put("Using cached WAV conversion for MP3 voice.")
return target
status_queue.put("Converting MP3 voice reference to WAV...")
_convert_mp3_to_wav(source, target)
status_queue.put("MP3 voice conversion ready.")
return target
def _convert_mp3_to_wav(source: Path, target: Path) -> None:
    """Convert ``source`` to a mono, 24 kHz, 16-bit WAV file at ``target``.

    The conversion is written to a sibling temporary file and renamed into
    place only after ffmpeg succeeds, so ``target`` never holds a partially
    written file that a later cache lookup could mistake for a finished one.

    Raises:
        RuntimeError: If ``imageio_ffmpeg`` is not installed.
        subprocess.CalledProcessError: If ffmpeg exits with an error.
    """
    try:
        import imageio_ffmpeg
    except ImportError as exc:
        raise RuntimeError("MP3 custom voices require imageio-ffmpeg. Reinstall Seshat TTS dependencies.") from exc
    # Keep the original suffix last: ffmpeg derives the output format from it.
    partial = target.with_name(f"{target.stem}.partial{target.suffix}")
    command = [
        imageio_ffmpeg.get_ffmpeg_exe(),
        "-hide_banner",
        "-loglevel",
        "error",
        "-y",
        "-i",
        str(source),
        "-vn",
        "-ac",
        "1",
        "-ar",
        "24000",
        "-sample_fmt",
        "s16",
        str(partial),
    ]
    try:
        subprocess.run(command, check=True, env=_clean_subprocess_env(), creationflags=_subprocess_creationflags())
        os.replace(partial, target)
    finally:
        # After a successful rename this is a no-op; after a failure it
        # removes whatever ffmpeg managed to write.
        partial.unlink(missing_ok=True)
def _find_wav_data_offset(data: bytearray) -> int | None:
marker = data.find(b"data")
if marker < 0 or len(data) < marker + 8:
return None
return marker + 8
def _read_wav_format(data: bytearray) -> tuple[int, int, int]:
    """Read ``(channels, sample_rate, bytes_per_sample)`` from a WAV header.

    Raises:
        ValueError: If the buffer does not start with a RIFF/WAVE header of at
            least 36 bytes, or has no complete ``fmt `` chunk.
    """
    looks_like_wav = len(data) >= 36 and data[:4] == b"RIFF" and data[8:12] == b"WAVE"
    if not looks_like_wav:
        raise ValueError("Response is not a WAV stream.")

    fmt = data.find(b"fmt ")
    if fmt < 0 or len(data) < fmt + 24:
        raise ValueError("WAV stream is missing fmt chunk.")

    def field(begin: int, end: int) -> int:
        # Little-endian integer at [begin, end) relative to the chunk id.
        return int.from_bytes(data[fmt + begin : fmt + end], "little")

    return field(10, 12), field(12, 16), field(22, 24) // 8
def _pcm_to_float32(pcm: bytes, sample_width: int, channels: int) -> np.ndarray:
    """Decode integer PCM bytes into a 2-D float32 array of frames.

    Samples of width 2 are read as int16 and divided by 32768; samples of
    width 4 are read as int32 and divided by 2147483648. The result has one
    column per channel, and a single column when ``channels`` is 1 or less.

    Raises:
        ValueError: For any other sample width.
    """
    if sample_width not in (2, 4):
        raise ValueError(f"Unsupported WAV sample width: {sample_width}")

    if sample_width == 2:
        samples = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32768.0
    else:
        samples = np.frombuffer(pcm, dtype=np.int32).astype(np.float32) / 2147483648.0

    columns = channels if channels > 1 else 1
    return samples.reshape(-1, columns)
def _clamp_volume_gain(value: float) -> float:
    """Convert ``value`` to float and limit it to the range 0.0 to 3.0.

    A NaN input yields 0.0.
    """
    gain = float(value)
    capped = 3.0 if 3.0 < gain else gain
    # Written as a positive test so NaN falls through to the 0.0 floor.
    return capped if capped > 0.0 else 0.0
def _apply_volume_gain(audio: np.ndarray, volume_gain: float) -> np.ndarray:
    """Scale ``audio`` by the clamped gain and clip the result to [-1.0, 1.0].

    A gain of exactly 1.0 returns ``audio`` itself, untouched and unclipped.
    """
    factor = _clamp_volume_gain(volume_gain)
    if factor != 1.0:
        scaled = np.clip(audio * factor, -1.0, 1.0)
        return scaled.astype(np.float32, copy=False)
    return audio
def _find_uvx() -> Path:
    """Locate the uvx executable.

    Lookup order: the ``tools/uvx.exe`` resource path, ``uvx`` on PATH, then
    ``uvx.exe`` under ``~/.local/bin`` and ``~/.cargo/bin``.

    Raises:
        FileNotFoundError: If none of these locations has it.
    """
    bundled_uvx = resource_path("tools/uvx.exe")
    if bundled_uvx.exists():
        return bundled_uvx

    on_path = shutil.which("uvx")
    if on_path:
        return Path(on_path)

    for install_dir in (Path.home() / ".local" / "bin", Path.home() / ".cargo" / "bin"):
        fallback = install_dir / "uvx.exe"
        if fallback.exists():
            return fallback

    raise FileNotFoundError("uvx.exe was not found on PATH. Install uv or add uvx.exe to PATH.")
def _clean_subprocess_env() -> dict[str, str]:
    """Return a copy of the environment prepared for launching child processes.

    Variables starting with ``_PYI`` or ``PYINSTALLER`` are dropped, as are
    ``PYTHONHOME`` and ``PYTHONPATH``. When ``~/.local/bin`` exists it is put
    at the front of ``PATH``.
    """
    cleaned = {
        name: value
        for name, value in os.environ.items()
        if not name.startswith(("_PYI", "PYINSTALLER"))
    }
    for python_var in ("PYTHONHOME", "PYTHONPATH"):
        cleaned.pop(python_var, None)

    local_bin = Path.home() / ".local" / "bin"
    if local_bin.exists():
        cleaned["PATH"] = str(local_bin) + os.pathsep + cleaned.get("PATH", "")
    return cleaned
def _subprocess_creationflags() -> int:
    """Return ``subprocess.CREATE_NO_WINDOW`` on Windows and 0 elsewhere."""
    on_windows = os.name == "nt"
    return int(getattr(subprocess, "CREATE_NO_WINDOW", 0)) if on_windows else 0
+55
View File
@@ -0,0 +1,55 @@
from __future__ import annotations
import json
import re
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any
from .config import APP_DIR
# Default JSON file used by the load/save/upsert helpers in this module.
VOICE_PROFILES_PATH = APP_DIR / "voice_profiles.json"
@dataclass(slots=True)
class VoiceProfile:
name: str
path: str
def safe_voice_slug(name: str) -> str:
    """Turn ``name`` into a slug of ASCII letters, digits, ``.``, ``_`` and ``-``.

    Each run of other characters becomes a single ``-``, and leading or
    trailing ``-``, ``.`` and ``_`` are removed. When nothing remains the
    result is ``"custom-voice"``.
    """
    trimmed = name.strip()
    collapsed = re.sub(r"[^a-zA-Z0-9._-]+", "-", trimmed)
    cleaned = collapsed.strip("-._")
    if cleaned:
        return cleaned
    return "custom-voice"
def load_voice_profiles(path: Path = VOICE_PROFILES_PATH) -> list[VoiceProfile]:
    """Read voice profiles from the JSON file at ``path``.

    A missing file, or JSON whose top level is not a list, yields an empty
    list. Entries that are not objects, or lack a non-blank ``name`` or
    ``path``, are skipped. Both values are converted to strings and stripped.
    """
    if not path.exists():
        return []

    payload = json.loads(path.read_text(encoding="utf-8"))
    if not isinstance(payload, list):
        return []

    loaded: list[VoiceProfile] = []
    for entry in payload:
        if isinstance(entry, dict):
            profile_name = str(entry.get("name", "")).strip()
            file_path = str(entry.get("path", "")).strip()
            if profile_name and file_path:
                loaded.append(VoiceProfile(name=profile_name, path=file_path))
    return loaded
def save_voice_profiles(profiles: list[VoiceProfile], path: Path = VOICE_PROFILES_PATH) -> None:
    """Write ``profiles`` to ``path`` as indented JSON, creating parent folders."""
    path.parent.mkdir(parents=True, exist_ok=True)
    records = [asdict(profile) for profile in profiles]
    serialized = json.dumps(records, indent=2)
    path.write_text(serialized, encoding="utf-8")
def upsert_voice_profile(profile: VoiceProfile, path: Path = VOICE_PROFILES_PATH) -> list[VoiceProfile]:
    """Insert ``profile`` or replace the stored one with the same name.

    The updated list is sorted case-insensitively by name, saved to ``path``
    and returned.
    """
    kept = [existing for existing in load_voice_profiles(path) if existing.name != profile.name]
    merged = sorted([*kept, profile], key=lambda entry: entry.name.casefold())
    save_voice_profiles(merged, path)
    return merged
def voice_profile_by_name(name: str, profiles: list[VoiceProfile]) -> VoiceProfile | None:
    """Return the first profile whose name equals ``name`` exactly, else ``None``."""
    for candidate in profiles:
        if candidate.name == name:
            return candidate
    return None
+70
View File
@@ -0,0 +1,70 @@
from __future__ import annotations
from dataclasses import dataclass
import win32gui
@dataclass(frozen=True, slots=True)
class WindowInfo:
hwnd: int
title: str
left: int
top: int
right: int
bottom: int
@property
def width(self) -> int:
return self.right - self.left
@property
def height(self) -> int:
return self.bottom - self.top
@property
def label(self) -> str:
return f"{self.title} [{self.width}x{self.height} at {self.left},{self.top}]"
def _is_candidate(hwnd: int) -> bool:
    """Return whether ``hwnd`` should be offered for capture.

    A window qualifies when it is visible, has a non-blank title, and is more
    than 50 units wide and more than 50 units tall.
    """
    if not win32gui.IsWindowVisible(hwnd):
        return False
    if not win32gui.GetWindowText(hwnd).strip():
        return False
    left, top, right, bottom = win32gui.GetWindowRect(hwnd)
    width = right - left
    height = bottom - top
    return width > 50 and height > 50
def list_visible_windows() -> list[WindowInfo]:
    """Enumerate candidate windows, sorted case-insensitively by title."""
    found: list[WindowInfo] = []

    def collect(hwnd: int, _extra: object) -> None:
        # Called once per window by EnumWindows; keeps only the candidates.
        if not _is_candidate(hwnd):
            return
        left, top, right, bottom = win32gui.GetWindowRect(hwnd)
        caption = win32gui.GetWindowText(hwnd).strip()
        found.append(
            WindowInfo(hwnd=hwnd, title=caption, left=left, top=top, right=right, bottom=bottom)
        )

    win32gui.EnumWindows(collect, None)
    return sorted(found, key=lambda info: info.title.casefold())
def find_window_by_title(title: str) -> WindowInfo | None:
    """Find a window by title, preferring an exact match.

    The stripped ``title`` is first compared for equality against every
    candidate window title. If none is equal, the first window whose title
    contains it case-insensitively is returned. Windows are enumerated once,
    so both passes look at the same snapshot.

    Returns:
        The matching window, or ``None`` for a blank title or no match.
    """
    wanted = title.strip()
    if not wanted:
        return None
    windows = list_visible_windows()
    for window in windows:
        if window.title == wanted:
            return window
    needle = wanted.casefold()
    return next((window for window in windows if needle in window.title.casefold()), None)
+80
View File
@@ -0,0 +1,80 @@
from pathlib import Path
import json
from seshat_tts.config import AppConfig, Rect, load_config, save_config
def test_config_roundtrip(tmp_path: Path) -> None:
    """A populated config is equal to itself after being saved and loaded."""
    config_file = tmp_path / "config.json"
    original = AppConfig(
        monitor_index=2,
        hotkey="ctrl+shift+d",
        capture_region_hotkey="ctrl+shift+r",
        stop_hotkey="ctrl+shift+s",
        dialogue_rect=Rect(left=1, top=2, width=3, height=4),
        tesseract_cmd="C:/Tesseract/tesseract.exe",
        voice_source="custom-wav",
        default_voice="alba",
        voice_path="voice.mp3",
        language="english",
        quantize_tts=True,
        volume_gain=1.75,
        last_text="hello",
    )

    save_config(original, config_file)
    reloaded = load_config(config_file)

    assert reloaded == original
def test_load_config_removes_old_region_metadata_from_last_text(tmp_path: Path) -> None:
    """Region metadata lines stored in ``last_text`` are dropped on load."""
    stored = {
        "last_text": "Capture region: 85,51 628x84\nText region: 85,44 633x77\n\nA line to read."
    }
    config_file = tmp_path / "config.json"
    config_file.write_text(json.dumps(stored), encoding="utf-8")

    loaded = load_config(config_file)

    assert loaded.last_text == "A line to read."
def test_load_config_reads_llm_settings(tmp_path: Path) -> None:
    """Every ``llm_*`` key in the file is reflected on the loaded config."""
    stored = {
        "llm_enabled": True,
        "llm_base_url": "http://127.0.0.1:11434/v1",
        "llm_api_key": "local",
        "llm_model": "unsloth-local",
        "llm_timeout": 1.5,
        "llm_max_tokens": 64,
        "llm_disable_thinking": False,
        "llm_image_extraction": True,
        "llm_system_prompt": "clean this",
    }
    config_file = tmp_path / "config.json"
    config_file.write_text(json.dumps(stored), encoding="utf-8")

    loaded = load_config(config_file)

    assert loaded.llm_enabled is True
    assert loaded.llm_base_url == "http://127.0.0.1:11434/v1"
    assert loaded.llm_api_key == "local"
    assert loaded.llm_model == "unsloth-local"
    assert loaded.llm_timeout == 1.5
    assert loaded.llm_max_tokens == 64
    assert loaded.llm_disable_thinking is False
    assert loaded.llm_image_extraction is True
    assert loaded.llm_system_prompt == "clean this"
def test_load_config_forces_english_language(tmp_path: Path) -> None:
    """A stored non-English language is replaced by ``"english"`` on load."""
    config_file = tmp_path / "config.json"
    stored = json.dumps({"language": "french"})
    config_file.write_text(stored, encoding="utf-8")

    loaded = load_config(config_file)

    assert loaded.language == "english"
+118
View File
@@ -0,0 +1,118 @@
from __future__ import annotations
from dataclasses import dataclass
from PIL import Image
from seshat_tts.llm import process_image_with_llm, process_text_with_llm
@dataclass
class _Message:
    """Test double for a chat message; carries only the reply text."""

    content: str
@dataclass
class _Choice:
    """Test double for one completion choice wrapping a ``_Message``."""

    message: _Message
@dataclass
class _Response:
    """Test double for a chat completion response holding its choices."""

    choices: list[_Choice]
class _Completions:
    """Test double that records ``create`` arguments and returns a fixed reply."""

    def __init__(self) -> None:
        """Start with no recorded call."""
        self.kwargs: dict[str, object] | None = None

    def create(self, **kwargs: object) -> _Response:
        """Remember ``kwargs`` and answer with the text ``"Cleaned text."``."""
        self.kwargs = kwargs
        reply = _Message("Cleaned text.")
        return _Response([_Choice(reply)])
class _Client:
    """Test double exposing ``chat.completions`` like the client the code calls."""

    def __init__(self) -> None:
        """Attach a fresh ``_Completions`` recorder under ``chat.completions``."""
        chat_namespace = type("Chat", (), {"completions": _Completions()})
        self.chat = chat_namespace()
def test_llm_disabled_returns_original_text() -> None:
    """With the LLM disabled the input comes back with only whitespace stripped."""
    result = process_text_with_llm(
        " OCR text ",
        enabled=False,
        base_url="http://127.0.0.1:8000/v1",
        api_key="local",
        model="unsloth",
        system_prompt="clean",
    )

    assert result == "OCR text"
def test_llm_enabled_uses_openai_compatible_chat_client() -> None:
    """An enabled LLM call goes through ``chat.completions.create`` with thinking off."""
    fake_client = _Client()

    result = process_text_with_llm(
        "OCR text",
        enabled=True,
        base_url="http://127.0.0.1:8000/v1",
        api_key="local",
        model="unsloth-model",
        system_prompt="clean",
        timeout=1,
        max_tokens=32,
        client=fake_client,
    )

    assert result == "Cleaned text."
    sent = fake_client.chat.completions.kwargs
    assert sent is not None
    assert sent["model"] == "unsloth-model"
    assert sent["temperature"] == 0
    assert sent["extra_body"] == {
        "chat_template_kwargs": {"enable_thinking": False},
        "enable_thinking": False,
        "reasoning_effort": "none",
    }
def test_llm_can_send_without_disable_thinking_metadata() -> None:
    """With ``disable_thinking=False`` no ``extra_body`` is sent."""
    fake_client = _Client()

    process_text_with_llm(
        "OCR text",
        enabled=True,
        base_url="http://127.0.0.1:8000/v1",
        api_key="local",
        model="unsloth-model",
        system_prompt="clean",
        disable_thinking=False,
        client=fake_client,
    )

    sent = fake_client.chat.completions.kwargs
    assert sent is not None
    assert "extra_body" not in sent
def test_llm_can_extract_text_from_image_region() -> None:
    """An image request sends a text part followed by a PNG data-URL part."""
    fake_client = _Client()
    region = Image.new("RGB", (16, 8), "black")

    result = process_image_with_llm(
        region,
        base_url="http://127.0.0.1:8000/v1",
        api_key="local",
        model="vision-model",
        timeout=1,
        max_tokens=64,
        client=fake_client,
    )

    assert result == "Cleaned text."
    sent = fake_client.chat.completions.kwargs
    assert sent is not None
    assert sent["model"] == "vision-model"
    # The second message is the user turn carrying the multi-part content.
    text_part, image_part = sent["messages"][1]["content"][:2]
    assert text_part["type"] == "text"
    assert image_part["type"] == "image_url"
    assert image_part["image_url"]["url"].startswith("data:image/png;base64,")
+22
View File
@@ -0,0 +1,22 @@
from seshat_tts.ocr import extract_text_from_lines
def test_selected_text_does_not_skip_first_line() -> None:
    """Both lines are kept and joined with a single space."""
    ocr_lines = ["A large group of humanoids came from the foothills", "and headed north not long ago."]

    joined = extract_text_from_lines(ocr_lines)

    assert joined == "A large group of humanoids came from the foothills and headed north not long ago."
def test_selected_text_includes_choice_marker_text_when_inside_region() -> None:
    """A line starting with ``|.`` is kept verbatim in the joined text."""
    ocr_lines = ["Line to read.", "|. Continue"]

    joined = extract_text_from_lines(ocr_lines)

    assert joined == "Line to read. |. Continue"
def test_selected_text_includes_pipe_marker_without_dot_when_inside_region() -> None:
    """A line starting with a bare ``|`` is kept verbatim in the joined text."""
    ocr_lines = ["Line to read.", "| Continue"]

    joined = extract_text_from_lines(ocr_lines)

    assert joined == "Line to read. | Continue"
+34
View File
@@ -0,0 +1,34 @@
from pathlib import Path
import queue
from seshat_tts import tts
def test_prepared_audio_prompt_leaves_wav_unchanged(tmp_path: Path) -> None:
    """A ``.wav`` prompt is returned as the very same path."""
    wav_file = tmp_path / "voice.wav"
    wav_file.write_bytes(b"wav")

    prepared = tts._prepared_audio_prompt_path(wav_file, "english", queue.Queue())

    assert prepared == wav_file
def test_prepared_audio_prompt_converts_mp3_once(tmp_path: Path, monkeypatch) -> None:
    """An MP3 prompt is converted on first use and served from cache afterwards."""
    mp3_file = tmp_path / "voice.mp3"
    mp3_file.write_bytes(b"mp3")
    cache_dir = tmp_path / "cache"
    conversions: list[tuple[Path, Path]] = []

    def fake_convert(input_path: Path, output_path: Path) -> None:
        # Stand-in for ffmpeg: record the call and create the output file.
        conversions.append((input_path, output_path))
        output_path.write_bytes(b"wav")

    monkeypatch.setattr(tts, "VOICE_CACHE_DIR", cache_dir)
    monkeypatch.setattr(tts, "_convert_mp3_to_wav", fake_convert)

    first = tts._prepared_audio_prompt_path(mp3_file, "english", queue.Queue())
    second = tts._prepared_audio_prompt_path(mp3_file, "english", queue.Queue())

    assert first == second
    assert first.suffix == ".wav"
    assert first.exists()
    assert conversions == [(mp3_file, first)]