59 lines
2.1 KiB
Python
import asyncio
|
|
import httpx
|
|
|
|
|
|
class TranscriptionEngine:
    """Speech-to-text engine wrapping faster-whisper.

    Transcription runs either locally via the ``faster_whisper`` package or
    remotely against an OpenAI-compatible ``/v1/audio/transcriptions``
    endpoint. The local model is loaded lazily on first use and cached per
    (model_name, device) configuration.
    """

    # Cached WhisperModel instance and the (model_name, device) pair it was
    # loaded for. `_model` is kept as a class attribute for backward
    # compatibility with the original single-slot cache.
    _model = None
    _model_key = None

    def _get_model(self, model_name: str = "large-v3", device: str = "auto"):
        """Return a cached WhisperModel, (re)loading when needed.

        BUG FIX: the original cached the first model forever and silently
        ignored any later ``model_name``/``device`` arguments; the cache is
        now keyed on both.
        """
        key = (model_name, device)
        if self._model is None or self._model_key != key:
            # Imported lazily so the remote-only path works without the
            # faster_whisper package installed.
            from faster_whisper import WhisperModel

            if device == "auto":
                # Prefer GPU with fp16; fall back to CPU int8 when CUDA is
                # unavailable or initialization fails.
                try:
                    self._model = WhisperModel(model_name, device="cuda", compute_type="float16")
                except Exception:
                    self._model = WhisperModel(model_name, device="cpu", compute_type="int8")
            else:
                compute = "float16" if device in ("cuda", "rocm") else "int8"
                self._model = WhisperModel(model_name, device=device, compute_type=compute)
            self._model_key = key
        return self._model

    async def transcribe_file(
        self,
        audio_path: str,
        language: str = "de",
        model_name: str = "large-v3",
        device: str = "auto",
        base_url: str = "",
    ) -> str:
        """Transcribe an audio file and return the recognized text.

        Dispatches to a remote OpenAI-compatible server when ``base_url``
        is non-empty; otherwise runs faster-whisper locally.
        """
        if base_url:
            return await self._transcribe_remote(audio_path, language, model_name, base_url)
        return await self._transcribe_local(audio_path, language, model_name, device)

    async def _transcribe_remote(
        self, audio_path: str, language: str, model_name: str, base_url: str
    ) -> str:
        """POST the audio file to ``{base_url}/v1/audio/transcriptions``.

        Returns the ``text`` field of the JSON response. Raises
        ``httpx.HTTPStatusError`` on a non-2xx response.
        """
        # rstrip so "http://host/" and "http://host" both yield a valid URL
        # (the original produced a double slash for trailing-slash bases).
        url = f"{base_url.rstrip('/')}/v1/audio/transcriptions"
        async with httpx.AsyncClient(timeout=300) as client:
            with open(audio_path, "rb") as f:
                r = await client.post(
                    url,
                    files={"file": ("audio.wav", f, "audio/wav")},
                    data={"model": model_name, "language": language},
                )
            r.raise_for_status()
            return r.json()["text"]

    async def _transcribe_local(
        self, audio_path: str, language: str, model_name: str, device: str
    ) -> str:
        """Run faster-whisper in a worker thread and return the joined text.

        BUG FIX: faster-whisper's ``transcribe`` returns a *lazy* segment
        generator; the original consumed it on the event loop thread, so the
        actual decoding blocked the loop. Model loading and generator
        consumption now both happen inside the executor.
        """

        def _run() -> str:
            model = self._get_model(model_name, device)
            segments, _ = model.transcribe(audio_path, language=language)
            # Consuming the generator here performs the real decode work
            # off the event loop.
            return "".join(seg.text for seg in segments).strip()

        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, _run)
|
|
|
|
|
|
# Module-level singleton shared by importers; creating it is cheap because
# the Whisper model is only loaded lazily on first local transcription.
engine = TranscriptionEngine()
|