feat: add whisper.cpp ROCm backend support for AMD GPU acceleration

- transcription.py: new _transcribe_remote_whispercpp() using /inference endpoint
- transcription.py: backend param routes to openai or whispercpp remote path
- config.py: whisper.backend default 'openai', alt 'whispercpp'
- pipeline.py: passes backend from config to transcribe_file
- settings: backend dropdown (OpenAI-compat / whisper.cpp)
- SETUP.md: whisper.cpp ROCm build and systemd setup instructions

whisper-cpp-server running on beastix:8080 (ROCm0, gfx1030, RX 6800 XT)
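
For reference, a minimal usage sketch of the new backend parameter against that server. This is not part of the commit: the import path, constructor arguments, file name, and exact argument order are assumptions; only the transcribe_file parameters shown in the diff below are taken from the change itself.

import asyncio

from transcription import TranscriptionEngine  # import path assumed

async def main() -> None:
    engine = TranscriptionEngine()  # constructor args not shown in this commit
    # backend="whispercpp" routes the remote path to _transcribe_remote_whispercpp()
    segments = await engine.transcribe_file(
        "recording.wav",                    # example file, not from the commit
        language="en",
        base_url="http://beastix:8080",     # whisper-cpp-server noted above
        with_segments=True,
        backend="whispercpp",
    )
    for seg in segments:
        print(f"[{seg['start']:7.2f} - {seg['end']:7.2f}] {seg['text']}")

asyncio.run(main())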
2026-04-02 01:33:32 +02:00
parent 56d41b8620
commit c7cad4bb2a
6 changed files with 75 additions and 19 deletions
@@ -27,8 +27,13 @@ class TranscriptionEngine:
device: str = "auto",
base_url: str = "",
with_segments: bool = False,
backend: str = "openai",
) -> Union[str, list[dict]]:
if base_url:
if backend == "whispercpp":
return await self._transcribe_remote_whispercpp(
audio_path, language, base_url, with_segments
)
return await self._transcribe_remote(
audio_path, language, model_name, base_url, with_segments
)
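
The config.py and pipeline.py hunks are not included in this excerpt; a rough sketch of the wiring the commit message describes, with the config accessor shape and key layout assumed:

# Hypothetical pipeline.py side (not the actual diff): read whisper.backend
# from config and hand it to transcribe_file; accessor shape is an assumption.
whisper_cfg = config.get("whisper", {})
backend = whisper_cfg.get("backend", "openai")   # 'openai' (default) or 'whispercpp'

text = await engine.transcribe_file(
    audio_path,
    language=whisper_cfg.get("language", "auto"),
    base_url=whisper_cfg.get("base_url", ""),
    backend=backend,
)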
@@ -67,6 +72,35 @@ class TranscriptionEngine:
]
return [{"start": 0.0, "end": 9999.0, "text": body["text"].strip()}]
async def _transcribe_remote_whispercpp(
self,
audio_path: str,
language: str,
base_url: str,
with_segments: bool,
) -> Union[str, list[dict]]:
async with httpx.AsyncClient(timeout=300) as client:
with open(audio_path, "rb") as f:
data = {"language": language}
if with_segments:
data["response_format"] = "verbose_json"
r = await client.post(
f"{base_url}/inference",
files={"file": ("audio.wav", f, "audio/wav")},
data=data,
)
r.raise_for_status()
body = r.json()
if not with_segments:
return body.get("text", "").strip()
raw_segs = body.get("segments") or []
if raw_segs:
return [
{"start": s["start"], "end": s["end"], "text": s["text"].strip()}
for s in raw_segs
]
return [{"start": 0.0, "end": 9999.0, "text": body.get("text", "").strip()}]
async def _transcribe_local(
self,
audio_path: str,