feat: meeting pipeline — parallel diarization, speaker ID, 3-doc output

This commit is contained in:
2026-04-02 01:13:24 +02:00
parent 37e432f7fa
commit e04816fce6
+119 -25
View File
@@ -1,16 +1,18 @@
import asyncio
import logging
import os
import tempfile
import traceback
from datetime import datetime

from api.state import state, Status
from api.router import broadcast
from config import load as load_config
from transcription import engine as transcription_engine
from llm import OllamaClient
from output import save_transcript, write_meeting_docs

# Module-level logger (PEP 8: defined after the import block, not between
# imports as the previous revision placed it — import order is unchanged).
logger = logging.getLogger(__name__)
async def run_pipeline(): async def run_pipeline():
@@ -21,6 +23,8 @@ async def run_pipeline():
output_dir = getattr(state, "_recording_output_dir", cfg["output"]["path"]) output_dir = getattr(state, "_recording_output_dir", cfg["output"]["path"])
instructions = getattr(state, "_recording_instructions", "") instructions = getattr(state, "_recording_instructions", "")
diar_cfg = cfg.get("diarization", {})
use_diarization = diar_cfg.get("enabled") and diar_cfg.get("hf_token")
recorder.stop() recorder.stop()
await state.set_status(Status.PROCESSING) await state.set_status(Status.PROCESSING)
@@ -32,6 +36,33 @@ async def run_pipeline():
wav_path = f.name wav_path = f.name
recorder.save_wav(wav_path) recorder.save_wav(wav_path)
if use_diarization:
await _run_meeting_pipeline(cfg, wav_path, output_dir, instructions, diar_cfg)
else:
await _run_solo_pipeline(cfg, wav_path, output_dir, instructions)
except Exception as e:
tb = traceback.format_exc()
logger.error("Pipeline error:\n%s", tb)
state.last_error = str(e)
await state.set_status(Status.ERROR)
await broadcast({"event": "error", "message": str(e)})
finally:
state.recording_user = None
state._recording_output_dir = None
state._recording_instructions = ""
state._speakers_event = None
state._pending_aligned_segments = None
state._speaker_names = None
if wav_path:
try:
os.unlink(wav_path)
except OSError:
pass
async def _run_solo_pipeline(cfg, wav_path, output_dir, instructions):
"""Original single-document pipeline (no diarization)."""
raw_text = await transcription_engine.transcribe_file( raw_text = await transcription_engine.transcribe_file(
wav_path, wav_path,
language=cfg["whisper"]["language"], language=cfg["whisper"]["language"],
@@ -47,7 +78,6 @@ async def run_pipeline():
instructions=instructions, instructions=instructions,
model=cfg["ollama"]["model"], model=cfg["ollama"]["model"],
) )
await broadcast({"event": "refined", "markdown": refined})
title = "Diktat" title = "Diktat"
for line in refined.splitlines(): for line in refined.splitlines():
@@ -55,26 +85,90 @@ async def run_pipeline():
title = line[2:].strip() title = line[2:].strip()
break break
path = save_transcript( path = save_transcript(title=title, content=refined, output_dir=output_dir)
title=title,
content=refined,
output_dir=output_dir,
)
await broadcast({"event": "saved", "path": path, "title": title}) await broadcast({"event": "saved", "path": path, "title": title})
await state.set_status(Status.IDLE) await state.set_status(Status.IDLE)
except Exception as e:
tb = traceback.format_exc() async def _run_meeting_pipeline(cfg, wav_path, output_dir, instructions, diar_cfg):
logger.error("Pipeline error:\n%s", tb) """Diarization pipeline: 3 documents, speaker identification."""
state.last_error = str(e) from diarization import Diarizer
await state.set_status(Status.ERROR) from alignment import align_segments
await broadcast({"event": "error", "message": str(e)})
finally: diarizer = Diarizer(hf_token=diar_cfg["hf_token"])
state.recording_user = None whisper_task = asyncio.create_task(
state._recording_output_dir = None transcription_engine.transcribe_file(
state._recording_instructions = "" wav_path,
if wav_path: language=cfg["whisper"]["language"],
try: model_name=cfg["whisper"]["model"],
os.unlink(wav_path) device=cfg["whisper"]["device"],
except OSError: base_url=cfg["whisper"].get("base_url", ""),
pass with_segments=True,
)
)
diar_task = asyncio.create_task(diarizer.diarize(wav_path))
whisper_segs, speaker_segs = await asyncio.gather(whisper_task, diar_task)
aligned = align_segments(whisper_segs, speaker_segs)
await broadcast({"event": "transcribed", "raw": " ".join(t for _, t in aligned)})
excerpt = "\n".join(f"{s}: {t}" for s, t in aligned[:20])
client = OllamaClient(base_url=cfg["ollama"]["base_url"])
name_map = await client.identify_speakers(excerpt, model=cfg["ollama"]["model"])
if not name_map:
excerpts_per_speaker = _build_excerpts(aligned)
state._speakers_event = asyncio.Event()
state._pending_aligned_segments = aligned
await state.set_status(Status.AWAITING_SPEAKERS)
await broadcast({"event": "speakers_unknown", "speakers": [
{"id": spk, "excerpts": exs}
for spk, exs in excerpts_per_speaker.items()
]})
await state._speakers_event.wait()
name_map = state._speaker_names or {}
def resolve(label):
name = name_map.get(label, "")
if name:
return name
num = label.replace("SPEAKER_", "").lstrip("0") or "1"
return f"Sprecher {num}"
named_aligned = [(resolve(spk), text) for spk, text in aligned]
speakers = sorted({spk for spk, _ in named_aligned})
total_secs = sum(s["end"] - s["start"] for s in whisper_segs) if whisper_segs else 0
duration_min = max(1, round(total_secs / 60))
transcript_text = "\n\n".join(f"**{spk}:** {txt}" for spk, txt in named_aligned)
summary = await client.summarize(transcript_text, model=cfg["ollama"]["model"])
dt = datetime.now()
paths = write_meeting_docs(
aligned_segments=named_aligned,
summary=summary,
speakers=speakers,
duration_min=duration_min,
output_dir=output_dir,
dt=dt,
)
await state.set_status(Status.IDLE)
await broadcast({
"event": "saved",
"path": paths["index"],
"title": f"Meeting {dt.strftime('%d.%m.%Y %H:%M')}",
"meeting": True,
"paths": paths,
})
def _build_excerpts(aligned: list[tuple[str, str]], max_per_speaker: int = 4) -> dict[str, list[str]]:
"""Build a dict of speaker → list of text excerpts."""
from collections import defaultdict
buckets: dict[str, list[str]] = defaultdict(list)
for spk, text in aligned:
if len(buckets[spk]) < max_per_speaker:
buckets[spk].append(text[:200])
return dict(buckets)