diff --git a/api/pipeline.py b/api/pipeline.py
index fef7a3f..d5481c6 100644
--- a/api/pipeline.py
+++ b/api/pipeline.py
@@ -103,6 +103,7 @@ async def _run_meeting_pipeline(cfg, wav_path, output_dir, instructions, diar_cf
             model_name=cfg["whisper"]["model"],
             device=cfg["whisper"]["device"],
             base_url=cfg["whisper"].get("base_url", ""),
+            backend=cfg["whisper"].get("backend", "openai"),
             with_segments=True,
         )
     )
diff --git a/config.py b/config.py
index 9b091ae..31f8af0 100644
--- a/config.py
+++ b/config.py
@@ -13,6 +13,7 @@ DEFAULTS = {
         "language": "de",
         "device": "auto",  # "auto" = use GPU if ROCm available, else CPU
         "base_url": "",
+        "backend": "openai",  # "openai" = OpenAI-compatible API, "whispercpp" = whisper.cpp /inference
     },
     "audio": {
         "device": "",
diff --git a/docs/SETUP.md b/docs/SETUP.md
index d93d445..65cd23e 100644
--- a/docs/SETUP.md
+++ b/docs/SETUP.md
@@ -20,34 +20,41 @@ settings page.
 
 ## Beastix (server setup, one-time)
 
-### 1. Install faster-whisper-server
+### 1. Build whisper.cpp with ROCm/GPU support
+
+Prerequisite: ROCm is installed (Arch: `sudo pacman -S rocm-hip-sdk`).
 
 ```bash
-sudo pacman -S python-pipx  # Arch Linux
-pipx install faster-whisper-server
-pipx ensurepath
+mkdir -p ~/src && cd ~/src
+git clone https://github.com/ggml-org/whisper.cpp.git --depth=1
+cd whisper.cpp
+
+# For an AMD RX 6800 XT (gfx1030); adjust the gfx target if needed
+cmake -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release -DWHISPER_BUILD_SERVER=ON
+cmake --build build -j$(nproc)
+
+# Download the large-v3 model (~2.9 GB)
+bash models/download-ggml-model.sh large-v3
 ```
 
-**Known bug in version 0.0.2**: missing `pyproject.toml` in the pipx venv:
-
-```bash
-cat > ~/.local/share/pipx/venvs/faster-whisper-server/lib/python*/site-packages/pyproject.toml << 'EOF'
-[project]
-name = "faster-whisper-server"
-version = "0.0.2"
-EOF
-```
+`gfx1030` = RX 6800 XT. For other AMD GPUs, find the target with `rocminfo | grep gfx`.
 
 ### 2. Set up a systemd user service
 
 ```bash
-cat > ~/.config/systemd/user/faster-whisper-server.service << 'EOF'
+cat > ~/.config/systemd/user/whisper-cpp-server.service << 'EOF'
 [Unit]
-Description=faster-whisper-server (OpenAI-compatible Whisper API)
+Description=whisper.cpp Server (ROCm/GPU)
 After=network.target
 
 [Service]
-ExecStart=%h/.local/bin/faster-whisper-server --host 0.0.0.0 --port 8000 --model large-v3
+ExecStart=%h/src/whisper.cpp/build/bin/whisper-server \
+  --host 0.0.0.0 \
+  --port 8080 \
+  --model %h/src/whisper.cpp/models/ggml-large-v3.bin \
+  --language de \
+  --threads 4 \
+  --convert
 Restart=on-failure
 RestartSec=5
 
@@ -56,9 +63,12 @@ WantedBy=default.target
 EOF
 
 systemctl --user daemon-reload
-systemctl --user enable --now faster-whisper-server.service
+systemctl --user enable --now whisper-cpp-server.service
 ```
 
+Check the logs with `journalctl --user -u whisper-cpp-server -f`.
+GPU use is confirmed once the logs show `using ROCm0 backend`.
+
 ### 3. Install Ollama (if not already present)
 
 ```bash
@@ -105,7 +115,8 @@ Log in as admin → gear icon in the header → Settings:
 
 | Field | Value (example) |
 |------|-----------------|
-| Whisper Server URL | `http://beastix:8000` |
+| Whisper Backend | `whisper.cpp Server` |
+| Whisper Server URL | `http://beastix:8080` |
 | Whisper Model | `large-v3` |
 | Ollama Server URL | `http://beastix:11434` |
 | Ollama Model | `gemma3:12b` (select from the dropdown) |
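Before wiring the app to the server, a quick smoke test against the `/inference` endpoint is worthwhile. A minimal sketch using `httpx` (already a dependency of `transcription.py`); the host and port come from the service definition above, and `sample.wav` is a hypothetical test file:

```python
import httpx

# One-shot request against whisper.cpp's /inference endpoint, using the same
# multipart shape as _transcribe_remote_whispercpp in transcription.py below.
with open("sample.wav", "rb") as f:  # hypothetical short German WAV
    r = httpx.post(
        "http://beastix:8080/inference",  # whisper-cpp-server from step 2
        files={"file": ("audio.wav", f, "audio/wav")},
        data={"language": "de", "response_format": "verbose_json"},
        timeout=300,
    )
r.raise_for_status()
body = r.json()
print(body.get("text", "").strip())
for seg in body.get("segments") or []:
    print(seg["start"], seg["end"], seg["text"].strip())
```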
diff --git a/frontend/settings.html b/frontend/settings.html
index 82ab5cc..f97f902 100644
--- a/frontend/settings.html
+++ b/frontend/settings.html
@@ -74,9 +74,16 @@
       <h2>Verarbeitung</h2>
 
+      <div>
+        <label for="whisper-backend">Whisper Backend</label>
+        <select id="whisper-backend">
+          <option value="openai">OpenAI-kompatible API (faster-whisper)</option>
+          <option value="whispercpp">whisper.cpp Server</option>
+        </select>
+      </div>
       <div>
         <label for="whisper-url">Whisper Server URL</label>
-        <input id="whisper-url" type="text" placeholder="http://beastix:8000">
+        <input id="whisper-url" type="text" placeholder="http://beastix:8080">
       </div>
 
       <div>
         <label for="whisper-model">Whisper Modell</label>
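The `value` attributes of the new `<select>` deliberately match the strings that `config.py` and `transcription.py` dispatch on. For orientation, a sketch of the `whisper` config block that results once these settings are saved (keys from `config.py` above; the URL is the example value from SETUP.md):

```python
# Merged config after saving the settings page (shape per DEFAULTS in config.py).
whisper_cfg = {
    "model": "large-v3",
    "language": "de",
    "device": "auto",
    "base_url": "http://beastix:8080",  # the whisper.cpp server set up above
    "backend": "whispercpp",            # selected in the new dropdown
}
```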
diff --git a/frontend/settings.js b/frontend/settings.js
index 034a4d9..52f81cd 100644
--- a/frontend/settings.js
+++ b/frontend/settings.js
@@ -53,6 +53,7 @@ async function loadConfig() {
     if (!r.ok) return;
     const cfg = await r.json();
     document.getElementById('audio-device').value = (cfg.audio && cfg.audio.device) || '';
+    document.getElementById('whisper-backend').value = (cfg.whisper && cfg.whisper.backend) || 'openai';
    document.getElementById('whisper-url').value = (cfg.whisper && cfg.whisper.base_url) || '';
    document.getElementById('whisper-model').value = (cfg.whisper && cfg.whisper.model) || 'large-v3';
    const ollamaUrl = (cfg.ollama && cfg.ollama.base_url) || 'http://localhost:11434';
@@ -96,6 +97,7 @@ document.getElementById('save-btn').addEventListener('click', async function() {
         whisper: {
             base_url: document.getElementById('whisper-url').value,
             model: document.getElementById('whisper-model').value,
+            backend: document.getElementById('whisper-backend').value,
         },
         ollama: {
             base_url: document.getElementById('ollama-url').value,
diff --git a/transcription.py b/transcription.py
index e0e1630..3247062 100644
--- a/transcription.py
+++ b/transcription.py
@@ -27,8 +27,13 @@ class TranscriptionEngine:
         device: str = "auto",
         base_url: str = "",
         with_segments: bool = False,
+        backend: str = "openai",
     ) -> Union[str, list[dict]]:
         if base_url:
+            if backend == "whispercpp":
+                return await self._transcribe_remote_whispercpp(
+                    audio_path, language, base_url, with_segments
+                )
             return await self._transcribe_remote(
                 audio_path, language, model_name, base_url, with_segments
             )
@@ -67,6 +72,35 @@ class TranscriptionEngine:
             ]
         return [{"start": 0.0, "end": 9999.0, "text": body["text"].strip()}]
 
+    async def _transcribe_remote_whispercpp(
+        self,
+        audio_path: str,
+        language: str,
+        base_url: str,
+        with_segments: bool,
+    ) -> Union[str, list[dict]]:
+        async with httpx.AsyncClient(timeout=300) as client:
+            with open(audio_path, "rb") as f:
+                data = {"language": language}
+                if with_segments:
+                    data["response_format"] = "verbose_json"
+                r = await client.post(
+                    f"{base_url}/inference",
+                    files={"file": ("audio.wav", f, "audio/wav")},
+                    data=data,
+                )
+        r.raise_for_status()
+        body = r.json()
+        if not with_segments:
+            return body.get("text", "").strip()
+        raw_segs = body.get("segments") or []
+        if raw_segs:
+            return [
+                {"start": s["start"], "end": s["end"], "text": s["text"].strip()}
+                for s in raw_segs
+            ]
+        return [{"start": 0.0, "end": 9999.0, "text": body.get("text", "").strip()}]
+
     async def _transcribe_local(
         self,
         audio_path: str,
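Finally, a usage sketch of the new code path. `TranscriptionEngine`'s constructor and the exact parameter order of `transcribe` are not visible in this diff, so both are assumed here; keyword arguments sidestep the ordering question:

```python
import asyncio

from transcription import TranscriptionEngine


async def main() -> None:
    engine = TranscriptionEngine()  # constructor args not shown in this diff; assumed none
    segments = await engine.transcribe(
        audio_path="meeting.wav",  # parameter names taken from the method body
        language="de",
        model_name="large-v3",     # unused on the whispercpp path; the server answers with its loaded model
        base_url="http://beastix:8080",
        backend="whispercpp",      # routes to _transcribe_remote_whispercpp
        with_segments=True,
    )
    for seg in segments:
        print(f'[{seg["start"]:8.2f} - {seg["end"]:8.2f}] {seg["text"]}')


asyncio.run(main())
```

With `backend` left at its default `"openai"`, the same call still goes through the existing `_transcribe_remote` path, so current deployments are unaffected.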