# tüit Transkriptor Implementation Plan > **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. **Goal:** Build a local AI transcription desktop tool with system tray icon, audio capture, faster-whisper transcription, Ollama LLM post-processing, and a browser-based UI that saves Markdown files to a Nextcloud-synced folder. **Architecture:** pystray tray icon + FastAPI local server (port 8765) + browser UI. Audio captured via sounddevice, transcribed via faster-whisper (ROCm), refined via Ollama (gemma3:12b). SIGUSR1 toggles recording for Wayland-compatible hotkey support. **Tech Stack:** Python 3.11+, FastAPI, uvicorn, pystray, Pillow, sounddevice, faster-whisper, httpx, tomllib (stdlib 3.11+) --- ### Task 1: Project Scaffold **Files:** - Create: `requirements.txt` - Create: `.gitignore` - Create: `pytest.ini` - Create: `CLAUDE.md` **Step 1: Create requirements.txt** ``` fastapi>=0.111 uvicorn[standard]>=0.29 pystray>=0.19 Pillow>=10.0 sounddevice>=0.4.6 faster-whisper>=1.0.3 httpx>=0.27 numpy>=1.26 tomli_w>=1.0 pytest>=8.0 pytest-asyncio>=0.23 ``` **Step 2: Create .gitignore** ``` __pycache__/ *.pyc *.pyo .venv/ venv/ *.egg-info/ dist/ .env data/ ``` **Step 3: Create pytest.ini** ```ini [pytest] asyncio_mode = auto ``` **Step 4: Create CLAUDE.md** ```markdown # CLAUDE.md — tüit Transkriptor Desktop transcription tool. Python, no Docker. 
## Key Commands # Install dependencies pip install -r requirements.txt # Run python main.py # Run tests pytest -v # Trigger recording toggle via signal pkill -USR1 -f main.py ## Architecture See docs/plans/2026-04-01-desktop-transcription-design.md ``` **Step 5: Commit** ```bash git add requirements.txt .gitignore CLAUDE.md pytest.ini git commit -m "chore: project scaffold" ``` --- ### Task 2: Config Module **Files:** - Create: `config.py` - Create: `tests/__init__.py` - Create: `tests/test_config.py` **Step 1: Write failing tests** ```python # tests/test_config.py import os import tempfile from unittest.mock import patch def test_config_loads_defaults(): with tempfile.TemporaryDirectory() as tmpdir: cfg_path = os.path.join(tmpdir, "config.toml") with patch("config.CONFIG_PATH", cfg_path): import config cfg = config.load() assert cfg["ollama"]["model"] == "gemma3:12b" assert cfg["whisper"]["model"] == "large-v3" assert cfg["server"]["port"] == 8765 def test_config_creates_file_on_first_run(): with tempfile.TemporaryDirectory() as tmpdir: cfg_path = os.path.join(tmpdir, "config.toml") with patch("config.CONFIG_PATH", cfg_path): import config config.load() assert os.path.exists(cfg_path) ``` **Step 2: Run tests to verify they fail** ```bash pytest tests/test_config.py -v ``` Expected: FAIL — `ModuleNotFoundError: No module named 'config'` **Step 3: Implement config.py** ```python import os import tomllib CONFIG_PATH = os.path.expanduser("~/.config/tueit-transcriber/config.toml") DEFAULTS = { "ollama": { "base_url": "http://localhost:11434", "model": "gemma3:12b", }, "whisper": { "model": "large-v3", "language": "de", "device": "auto", # "auto" = use GPU if ROCm available, else CPU }, "server": { "port": 8765, }, "output": { "path": os.path.expanduser( "~/cloud.shron.de/Hetzner Storagebox/work" ), }, "pid_file": os.path.expanduser("~/.local/run/tueit-transcriber.pid"), } def load() -> dict: 
os.makedirs(os.path.dirname(CONFIG_PATH), exist_ok=True) if not os.path.exists(CONFIG_PATH): _write_defaults() with open(CONFIG_PATH, "rb") as f: on_disk = tomllib.load(f) return _deep_merge(DEFAULTS, on_disk) def _deep_merge(base: dict, override: dict) -> dict: result = dict(base) for k, v in override.items(): if k in result and isinstance(result[k], dict) and isinstance(v, dict): result[k] = _deep_merge(result[k], v) else: result[k] = v return result def _write_defaults(): try: import tomli_w with open(CONFIG_PATH, "wb") as f: tomli_w.dump(DEFAULTS, f) except ImportError: with open(CONFIG_PATH, "w") as f: f.write("# tüit Transkriptor config\n\n") f.write('[ollama]\nbase_url = "http://localhost:11434"\nmodel = "gemma3:12b"\n\n') f.write('[whisper]\nmodel = "large-v3"\nlanguage = "de"\ndevice = "auto"\n\n') f.write('[server]\nport = 8765\n\n') f.write(f'[output]\npath = "{DEFAULTS["output"]["path"]}"\n') ``` **Step 4: Run tests to verify they pass** ```bash pytest tests/test_config.py -v ``` Expected: PASS **Step 5: Commit** ```bash git add config.py tests/__init__.py tests/test_config.py git commit -m "feat: config module with TOML defaults" ``` --- ### Task 3: Output Module **Files:** - Create: `output.py` - Create: `tests/test_output.py` **Step 1: Write failing tests** ```python # tests/test_output.py import os import tempfile from datetime import datetime def test_save_transcript_creates_file(): with tempfile.TemporaryDirectory() as tmpdir: from output import save_transcript path = save_transcript( title="Test Aufnahme", content="Dies ist ein Test.", output_dir=tmpdir, dt=datetime(2026, 4, 1, 14, 32, 0), ) assert os.path.exists(path) def test_save_transcript_filename_format(): with tempfile.TemporaryDirectory() as tmpdir: from output import save_transcript path = save_transcript( title="Mein erstes Diktat", content="Inhalt.", output_dir=tmpdir, dt=datetime(2026, 4, 1, 14, 32, 0), ) assert os.path.basename(path) == "2026-04-01-1432-mein-erstes-diktat.md" def 
test_save_transcript_contains_frontmatter(): with tempfile.TemporaryDirectory() as tmpdir: from output import save_transcript path = save_transcript( title="Test", content="Inhalt.", output_dir=tmpdir, dt=datetime(2026, 4, 1, 14, 32, 0), ) text = open(path).read() assert "---" in text assert "date:" in text assert "transkript" in text def test_save_transcript_contains_content(): with tempfile.TemporaryDirectory() as tmpdir: from output import save_transcript path = save_transcript( title="Test", content="Das ist der Inhalt.", output_dir=tmpdir, dt=datetime(2026, 4, 1, 14, 32, 0), ) assert "Das ist der Inhalt." in open(path).read() def test_slugify(): from output import slugify assert slugify("Mein erstes Diktat") == "mein-erstes-diktat" assert slugify("test -- foo") == "test-foo" ``` **Step 2: Run to verify failure** ```bash pytest tests/test_output.py -v ``` Expected: FAIL **Step 3: Implement output.py** ```python import os import re import unicodedata from datetime import datetime def slugify(text: str) -> str: for src, dst in [("ä","a"),("ö","o"),("ü","u"),("Ä","a"),("Ö","o"),("Ü","u"),("ß","ss")]: text = text.replace(src, dst) text = unicodedata.normalize("NFKD", text) text = "".join(c for c in text if unicodedata.category(c) != "Mn") text = text.lower() text = re.sub(r"[^a-z0-9]+", "-", text) return text.strip("-") def save_transcript( title: str, content: str, output_dir: str, dt: datetime | None = None, ) -> str: if dt is None: dt = datetime.now() slug = slugify(title)[:60] filename = f"{dt.strftime('%Y-%m-%d-%H%M')}-{slug}.md" os.makedirs(output_dir, exist_ok=True) path = os.path.join(output_dir, filename) with open(path, "w", encoding="utf-8") as f: f.write(f"---\ndate: {dt.isoformat(timespec='seconds')}\ntags: [transkript]\n---\n\n") f.write(f"# {title}\n\n") f.write(content) if not content.endswith("\n"): f.write("\n") return path def list_transcripts(output_dir: str, limit: int = 20) -> list[dict]: if not os.path.exists(output_dir): return [] files = 
sorted( [f for f in os.listdir(output_dir) if f.endswith(".md")], reverse=True, )[:limit] result = [] for f in files: full = os.path.join(output_dir, f) stat = os.stat(full) result.append({"filename": f, "path": full, "size": stat.st_size, "mtime": stat.st_mtime}) return result ``` **Step 4: Run tests to verify they pass** ```bash pytest tests/test_output.py -v ``` Expected: PASS **Step 5: Commit** ```bash git add output.py tests/test_output.py git commit -m "feat: output module — Markdown file writer with slugified filenames" ``` --- ### Task 4: LLM Module **Files:** - Create: `llm.py` - Create: `tests/test_llm.py` **Step 1: Write failing tests** ```python # tests/test_llm.py import pytest from unittest.mock import AsyncMock, patch, MagicMock @pytest.mark.asyncio async def test_refine_calls_ollama(): from llm import OllamaClient mock_response = MagicMock() mock_response.json.return_value = {"response": "# Titel\n\nInhalt."} mock_response.raise_for_status = MagicMock() with patch("httpx.AsyncClient") as MockClient: instance = MockClient.return_value.__aenter__.return_value instance.post = AsyncMock(return_value=mock_response) client = OllamaClient(base_url="http://localhost:11434") result = await client.refine( raw_text="Das ist ein test.", instructions="Mach eine Zusammenfassung.", model="gemma3:12b", ) assert "Inhalt" in result instance.post.assert_called_once() @pytest.mark.asyncio async def test_list_models_returns_list(): from llm import OllamaClient mock_response = MagicMock() mock_response.json.return_value = {"models": [{"name": "gemma3:12b"}, {"name": "mistral:7b"}]} mock_response.raise_for_status = MagicMock() with patch("httpx.AsyncClient") as MockClient: instance = MockClient.return_value.__aenter__.return_value instance.get = AsyncMock(return_value=mock_response) client = OllamaClient(base_url="http://localhost:11434") models = await client.list_models() assert "gemma3:12b" in models ``` **Step 2: Run to verify failure** ```bash pytest 
tests/test_llm.py -v ``` Expected: FAIL **Step 3: Implement llm.py** ```python import httpx SYSTEM_PROMPT = """Du bist ein präziser Schreibassistent. Du bekommst einen rohen Sprachtranskript und optionale Instruktionen des Nutzers. Deine Aufgabe: 1. Bereinige den Text (Füllwörter, Wiederholungen, Tippfehler) 2. Strukturiere ihn mit Markdown-Überschriften wenn sinnvoll 3. Erzeuge einen passenden deutschen Titel als H1 4. Beachte Instruktionen des Nutzers wenn vorhanden 5. Antworte NUR mit dem fertigen Markdown — kein Kommentar, keine Erklärung Format: # Titel Inhalt... """ class OllamaClient: def __init__(self, base_url: str = "http://localhost:11434"): self.base_url = base_url async def list_models(self) -> list[str]: async with httpx.AsyncClient() as client: r = await client.get(f"{self.base_url}/api/tags") r.raise_for_status() return [m["name"] for m in r.json().get("models", [])] async def refine( self, raw_text: str, instructions: str = "", model: str = "gemma3:12b", ) -> str: prompt = f"Transkript:\n{raw_text}" if instructions.strip(): prompt += f"\n\nInstruktionen:\n{instructions.strip()}" async with httpx.AsyncClient(timeout=120) as client: r = await client.post( f"{self.base_url}/api/generate", json={"model": model, "prompt": prompt, "system": SYSTEM_PROMPT, "stream": False}, ) r.raise_for_status() return r.json()["response"] ``` **Step 4: Run tests to verify they pass** ```bash pytest tests/test_llm.py -v ``` Expected: PASS **Step 5: Commit** ```bash git add llm.py tests/test_llm.py git commit -m "feat: LLM module — Ollama client with transcript refinement" ``` --- ### Task 5: Transcription Module **Files:** - Create: `transcription.py` - Create: `tests/test_transcription.py` **Step 1: Write failing tests** ```python # tests/test_transcription.py import asyncio from unittest.mock import MagicMock def test_transcription_engine_is_singleton(): from transcription import engine, TranscriptionEngine assert isinstance(engine, TranscriptionEngine) def 
test_transcribe_file_calls_whisper(tmp_path): wav = tmp_path / "test.wav" wav.write_bytes(b"\x00" * 100) mock_model = MagicMock() mock_segment = MagicMock() mock_segment.text = " Hallo Welt" mock_model.transcribe.return_value = ([mock_segment], MagicMock()) from transcription import TranscriptionEngine eng = TranscriptionEngine() eng._model = mock_model result = asyncio.run(eng.transcribe_file(str(wav), language="de")) assert result == "Hallo Welt" mock_model.transcribe.assert_called_once_with(str(wav), language="de") ``` **Step 2: Run to verify failure** ```bash pytest tests/test_transcription.py -v ``` Expected: FAIL **Step 3: Implement transcription.py** ```python import asyncio class TranscriptionEngine: _model = None def _get_model(self, model_name: str = "large-v3", device: str = "auto"): if self._model is None: from faster_whisper import WhisperModel if device == "auto": try: self._model = WhisperModel(model_name, device="cuda", compute_type="float16") except Exception: self._model = WhisperModel(model_name, device="cpu", compute_type="int8") else: compute = "float16" if device in ("cuda", "rocm") else "int8" self._model = WhisperModel(model_name, device=device, compute_type=compute) return self._model def _transcribe_sync(self, audio_path: str, language: str, model_name: str, device: str) -> str: """Blocking worker: faster-whisper yields segments lazily, so the model load and the segment generator are both consumed here, off the event loop.""" model = self._get_model(model_name, device) segments, _ = model.transcribe(audio_path, language=language) return "".join(seg.text for seg in segments).strip() async def transcribe_file( self, audio_path: str, language: str = "de", model_name: str = "large-v3", device: str = "auto", ) -> str: loop = asyncio.get_running_loop() return await loop.run_in_executor( None, self._transcribe_sync, audio_path, language, model_name, device, ) engine = TranscriptionEngine() ``` **Step 4: Run tests to verify they pass** ```bash pytest tests/test_transcription.py -v ``` Expected: PASS **Step 5: Commit** ```bash git add transcription.py tests/test_transcription.py git commit -m "feat: transcription module — faster-whisper with ROCm auto-detect" ``` --- ### Task 6: Audio Module **Files:** - Create: `audio.py` - Create: `tests/test_audio.py` 
**Step 1: Write failing tests** ```python # tests/test_audio.py import numpy as np from unittest.mock import patch, MagicMock def test_recorder_starts_and_stops(): from audio import AudioRecorder with patch("sounddevice.InputStream") as MockStream: mock_stream = MagicMock() MockStream.return_value.start = MagicMock() MockStream.return_value.stop = MagicMock() MockStream.return_value.close = MagicMock() recorder = AudioRecorder(sample_rate=16000) assert not recorder.is_recording recorder._stream = MockStream.return_value recorder.is_recording = True recorder.stop() assert not recorder.is_recording def test_recorder_save_wav(tmp_path): import wave from audio import AudioRecorder recorder = AudioRecorder(sample_rate=16000) recorder._buffer = [np.zeros(1600, dtype=np.int16)] out = str(tmp_path / "test.wav") recorder.save_wav(out) with wave.open(out) as wf: assert wf.getframerate() == 16000 assert wf.getnchannels() == 1 ``` **Step 2: Run to verify failure** ```bash pytest tests/test_audio.py -v ``` Expected: FAIL **Step 3: Implement audio.py** ```python import wave import threading import numpy as np class AudioRecorder: def __init__(self, sample_rate: int = 16000): self.sample_rate = sample_rate self._buffer: list[np.ndarray] = [] self._stream = None self.is_recording = False self._lock = threading.Lock() def _callback(self, indata, frames, time, status): if self.is_recording: with self._lock: self._buffer.append(indata[:, 0].copy().astype(np.int16)) def start(self): import sounddevice as sd self._buffer = [] self.is_recording = True self._stream = sd.InputStream( samplerate=self.sample_rate, channels=1, dtype="int16", callback=self._callback, ) self._stream.start() def stop(self): self.is_recording = False if self._stream: self._stream.stop() self._stream.close() self._stream = None def save_wav(self, path: str) -> str: with self._lock: data = np.concatenate(self._buffer) if self._buffer else np.zeros(0, dtype=np.int16) with wave.open(path, "wb") as wf: 
wf.setnchannels(1) wf.setsampwidth(2) wf.setframerate(self.sample_rate) wf.writeframes(data.tobytes()) return path ``` **Step 4: Run tests to verify they pass** ```bash pytest tests/test_audio.py -v ``` Expected: PASS **Step 5: Commit** ```bash git add audio.py tests/test_audio.py git commit -m "feat: audio module — sounddevice recorder with WAV export" ``` --- ### Task 7: App State Module **Files:** - Create: `api/__init__.py` - Create: `api/state.py` **Step 1: Implement** ```python # api/__init__.py # (empty) ``` ```python # api/state.py import asyncio from dataclasses import dataclass, field from enum import Enum from typing import Callable class Status(str, Enum): IDLE = "idle" RECORDING = "recording" PROCESSING = "processing" ERROR = "error" @dataclass class AppState: status: Status = Status.IDLE instructions: str = "" last_transcript_path: str | None = None last_error: str | None = None _listeners: list[Callable] = field(default_factory=list, repr=False) def subscribe(self, callback: Callable): self._listeners.append(callback) async def notify(self): for cb in self._listeners: if asyncio.iscoroutinefunction(cb): await cb(self) else: cb(self) async def set_status(self, status: Status): self.status = status await self.notify() state = AppState() ``` **Step 2: Commit** ```bash git add api/__init__.py api/state.py git commit -m "feat: app state module with status enum and subscriber pattern" ``` --- ### Task 8: API Router + Pipeline **Files:** - Create: `api/router.py` - Create: `api/pipeline.py` - Create: `tests/test_api.py` **Step 1: Write failing tests** ```python # tests/test_api.py from fastapi.testclient import TestClient def make_app(): from fastapi import FastAPI from api.router import router app = FastAPI() app.include_router(router) return app def test_status_returns_idle(): client = TestClient(make_app()) r = client.get("/status") assert r.status_code == 200 assert r.json()["status"] == "idle" def test_config_get_returns_dict(): client = 
TestClient(make_app()) r = client.get("/config") assert r.status_code == 200 assert "ollama" in r.json() def test_transcripts_returns_list(): client = TestClient(make_app()) r = client.get("/transcripts") assert r.status_code == 200 assert isinstance(r.json(), list) ``` **Step 2: Run to verify failure** ```bash pytest tests/test_api.py -v ``` Expected: FAIL **Step 3: Implement api/router.py** ```python # api/router.py import asyncio import os from fastapi import APIRouter, WebSocket, WebSocketDisconnect from api.state import state, Status from config import load as load_config from output import list_transcripts router = APIRouter() _ws_clients: list[WebSocket] = [] @router.get("/status") async def get_status(): return {"status": state.status, "instructions": state.instructions} @router.post("/toggle") async def toggle_recording(): from api.pipeline import run_pipeline if state.status == Status.RECORDING: asyncio.create_task(run_pipeline()) return {"action": "stopped"} if state.status == Status.IDLE: from audio import AudioRecorder state._recorder = AudioRecorder() state._recorder.start() await state.set_status(Status.RECORDING) return {"action": "started"} return {"action": "busy", "status": state.status} @router.post("/instructions") async def set_instructions(body: dict): state.instructions = body.get("instructions", "") return {"ok": True} @router.get("/transcripts") async def get_transcripts(): cfg = load_config() return list_transcripts(cfg["output"]["path"]) @router.get("/config") async def get_config(): return load_config() @router.put("/config") async def put_config(body: dict): cfg = load_config() cfg.update(body) return cfg @router.post("/open") async def open_file(body: dict): import subprocess path = body.get("path", "") if path and os.path.exists(path): subprocess.Popen(["xdg-open", path]) return {"ok": True} @router.websocket("/ws") async def websocket_endpoint(ws: WebSocket): await ws.accept() _ws_clients.append(ws) try: while True: await 
ws.receive_text() except WebSocketDisconnect: if ws in _ws_clients: _ws_clients.remove(ws) async def broadcast(message: dict): for ws in list(_ws_clients): try: await ws.send_json(message) except Exception: if ws in _ws_clients: _ws_clients.remove(ws) ``` **Step 4: Implement api/pipeline.py** ```python # api/pipeline.py import os import tempfile from api.state import state, Status from config import load as load_config from transcription import engine as transcription_engine from llm import OllamaClient from output import save_transcript from api.router import broadcast async def run_pipeline(): cfg = load_config() recorder = getattr(state, "_recorder", None) if recorder is None: return recorder.stop() await state.set_status(Status.PROCESSING) await broadcast({"event": "processing"}) wav_path = None try: with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f: wav_path = f.name recorder.save_wav(wav_path) raw_text = await transcription_engine.transcribe_file( wav_path, language=cfg["whisper"]["language"], model_name=cfg["whisper"]["model"], device=cfg["whisper"]["device"], ) await broadcast({"event": "transcribed", "raw": raw_text}) client = OllamaClient(base_url=cfg["ollama"]["base_url"]) refined = await client.refine( raw_text=raw_text, instructions=state.instructions, model=cfg["ollama"]["model"], ) await broadcast({"event": "refined", "markdown": refined}) title = "Diktat" for line in refined.splitlines(): if line.startswith("# "): title = line[2:].strip() break path = save_transcript( title=title, content=refined, output_dir=cfg["output"]["path"], ) state.last_transcript_path = path await broadcast({"event": "saved", "path": path, "title": title}) await state.set_status(Status.IDLE) except Exception as e: state.last_error = str(e) await state.set_status(Status.ERROR) await broadcast({"event": "error", "message": str(e)}) finally: if wav_path: try: os.unlink(wav_path) except OSError: pass ``` **Step 5: Run tests to verify they pass** ```bash pytest 
tests/test_api.py -v ``` Expected: PASS **Step 6: Commit** ```bash git add api/router.py api/pipeline.py tests/test_api.py git commit -m "feat: API router + pipeline — toggle, status, transcripts, WebSocket" ``` --- ### Task 9: Frontend **Files:** - Create: `frontend/index.html` - Create: `frontend/app.js` **Step 1: Create frontend/index.html** ```html