fix: punctuate raw transcript, strip JSON code fences, filter null speaker names

- llm: punctuate() adds punctuation/capitalisation without changing words
- llm: _strip_code_fences() handles markdown-wrapped JSON from gemma3
- llm: filter string 'null' from identify_speakers result
- pipeline: punctuate raw_text in parallel with refine for solo recordings

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-02 12:23:25 +02:00
parent d3582eaeb7
commit 658f9be47f
2 changed files with 37 additions and 8 deletions
+5 -6
View File
@@ -74,15 +74,14 @@ async def _run_solo_pipeline(cfg, wav_path, output_dir, instructions):
await broadcast({"event": "transcribed", "raw": raw_text}) await broadcast({"event": "transcribed", "raw": raw_text})
client = OllamaClient(base_url=cfg["ollama"]["base_url"]) client = OllamaClient(base_url=cfg["ollama"]["base_url"])
refined = await client.refine( punctuated, refined = await asyncio.gather(
raw_text=raw_text, client.punctuate(raw_text, model=cfg["ollama"]["model"]),
instructions=instructions, client.refine(raw_text=raw_text, instructions=instructions, model=cfg["ollama"]["model"]),
model=cfg["ollama"]["model"],
) )
title, tldr = await client.generate_title_and_tldr(refined, model=cfg["ollama"]["model"])
dt = datetime.now() dt = datetime.now()
title, tldr = await client.generate_title_and_tldr(refined, model=cfg["ollama"]["model"]) paths = write_solo_docs(raw_text=punctuated, refined=refined, output_dir=output_dir, dt=dt, title=title, tldr=tldr)
paths = write_solo_docs(raw_text=raw_text, refined=refined, output_dir=output_dir, dt=dt, title=title, tldr=tldr)
await state.set_status(Status.IDLE) await state.set_status(Status.IDLE)
await broadcast({ await broadcast({
+32 -2
View File
@@ -46,6 +46,21 @@ Zweiter Absatz...
""" """
# System prompt (written in German) that OllamaClient.punctuate sends alongside
# the raw transcript. It tells the model to add only punctuation marks (period,
# comma, question mark, exclamation mark) and sentence-initial capitalisation,
# to leave every word untouched (no changes, no cuts, no additions), and to
# answer with the corrected text only, without any commentary.
PUNCTUATE_PROMPT = """Du bekommst einen rohen deutschen Sprachtranskript ohne Großschreibung und Satzzeichen.
Füge AUSSCHLIESSLICH Satzzeichen (Punkt, Komma, Fragezeichen, Ausrufezeichen) und Großschreibung am Satzanfang hinzu.
Verändere KEINE Wörter, kürze NICHTS, füge NICHTS hinzu.
Antworte NUR mit dem korrigierten Text, ohne Kommentar."""
def _strip_code_fences(text: str) -> str:
"""Remove markdown code fences (```json ... ```) from LLM responses."""
import re
m = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", text)
if m:
return m.group(1)
return text
class OllamaClient: class OllamaClient:
def __init__(self, base_url: str = "http://localhost:11434"): def __init__(self, base_url: str = "http://localhost:11434"):
self.base_url = base_url self.base_url = base_url
@@ -91,7 +106,7 @@ class OllamaClient:
}, },
) )
r.raise_for_status() r.raise_for_status()
raw = r.json()["response"].strip() raw = _strip_code_fences(r.json()["response"].strip())
try: try:
data = json.loads(raw) data = json.loads(raw)
title = str(data.get("title", "")).strip() or "Diktat" title = str(data.get("title", "")).strip() or "Diktat"
@@ -100,6 +115,21 @@ class OllamaClient:
except Exception: except Exception:
return "Diktat", "Kein TL;DR verfügbar." return "Diktat", "Kein TL;DR verfügbar."
async def punctuate(
    self,
    text: str,
    model: str = "gemma3:12b",
) -> str:
    """Add punctuation and capitalisation to raw whisper output without changing words.

    Sends ``text`` as the prompt and ``PUNCTUATE_PROMPT`` as the system prompt
    in a single non-streaming request to ``{self.base_url}/api/generate``.

    Args:
        text: Raw transcript to punctuate.
        model: Name of the model to request.

    Returns:
        The model response with surrounding whitespace stripped. ``text`` is
        returned unchanged when it is empty or whitespace-only, or when the
        model answers with an empty string.

    Raises:
        httpx.HTTPStatusError: If the server answers with an error status.
    """
    # Nothing to punctuate: skip the (up to 120 s) round-trip. An empty prompt
    # would also invite the model to invent text, which this method must not do.
    if not text or not text.strip():
        return text

    payload = {
        "model": model,
        "prompt": text,
        "system": PUNCTUATE_PROMPT,
        "stream": False,
    }
    async with httpx.AsyncClient(timeout=120) as client:
        r = await client.post(f"{self.base_url}/api/generate", json=payload)
        r.raise_for_status()
        result = r.json()["response"].strip()

    # An empty answer is useless to the caller; fall back to the raw input.
    return result if result else text
async def identify_speakers( async def identify_speakers(
self, self,
transcript_excerpt: str, transcript_excerpt: str,
@@ -123,7 +153,7 @@ class OllamaClient:
data = json.loads(raw) data = json.loads(raw)
if not isinstance(data, dict): if not isinstance(data, dict):
return {} return {}
return {k: v for k, v in data.items() if v} return {k: v for k, v in data.items() if v and str(v).lower() != "null"}
except Exception: except Exception:
return {} return {}