Files
tueit_Transkriptor/diarization.py
T
2026-04-02 11:34:02 +02:00

30 lines
1.1 KiB
Python

import asyncio
class Diarizer:
    """Speaker diarization backed by a lazily loaded pyannote pipeline.

    The pipeline is not created at construction time; it is loaded on the
    first call that needs it and then kept on the instance for reuse.
    """

    def __init__(self, hf_token: str):
        """Store the Hugging Face token used when loading the pipeline.

        Args:
            hf_token: Token passed to ``Pipeline.from_pretrained``.

        Raises:
            ValueError: If ``hf_token`` is empty or otherwise falsy.
        """
        if not hf_token:
            raise ValueError("hf_token is required for diarization")
        self._pipeline = None  # populated on first use by _load_pipeline()
        self._hf_token = hf_token

    def _load_pipeline(self):
        """Return the cached pipeline, creating it on first use."""
        if self._pipeline is not None:
            return self._pipeline

        # Imported here rather than at module level, so pyannote is only
        # required once a pipeline is actually needed.
        from pyannote.audio import Pipeline

        self._pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1",
            token=self._hf_token,
        )
        return self._pipeline

    async def diarize(self, wav_path: str) -> list[tuple[float, float, str]]:
        """Run diarization on ``wav_path`` without blocking the event loop.

        Both the pipeline load and the pipeline call are dispatched to the
        running loop's default executor.

        Args:
            wav_path: Path handed unchanged to the pipeline.

        Returns:
            One ``(turn.start, turn.end, speaker)`` tuple per track yielded
            by the annotation (presumably times in seconds; confirm against
            pyannote).
        """
        event_loop = asyncio.get_running_loop()
        diarization_pipeline = await event_loop.run_in_executor(
            None, self._load_pipeline
        )
        output = await event_loop.run_in_executor(
            None, diarization_pipeline, wav_path
        )

        # pyannote 4.x returns DiarizeOutput; older versions return
        # Annotation directly
        tracks = getattr(output, "speaker_diarization", output)

        return [
            (segment.start, segment.end, label)
            for segment, _track, label in tracks.itertracks(yield_label=True)
        ]