fix: punctuate raw transcript, strip JSON code fences, filter null speaker names

- llm: punctuate() adds punctuation/capitalisation without changing words
- llm: _strip_code_fences() handles markdown-wrapped JSON from gemma3
- llm: filter string 'null' from identify_speakers result
- pipeline: punctuate raw_text in parallel with refine for solo recordings

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-02 12:23:25 +02:00
parent d3582eaeb7
commit 658f9be47f
2 changed files with 37 additions and 8 deletions
@@ -74,15 +74,14 @@ async def _run_solo_pipeline(cfg, wav_path, output_dir, instructions):
    await broadcast({"event": "transcribed", "raw": raw_text})
    client = OllamaClient(base_url=cfg["ollama"]["base_url"])
-    refined = await client.refine(
+    punctuated, refined = await asyncio.gather(
-        raw_text=raw_text,
+        client.punctuate(raw_text, model=cfg["ollama"]["model"]),
-        instructions=instructions,
+        client.refine(raw_text=raw_text, instructions=instructions, model=cfg["ollama"]["model"]),
        model=cfg["ollama"]["model"],
    )
    title, tldr = await client.generate_title_and_tldr(refined, model=cfg["ollama"]["model"])
    dt = datetime.now()
-    title, tldr = await client.generate_title_and_tldr(refined, model=cfg["ollama"]["model"])
+    paths = write_solo_docs(raw_text=punctuated, refined=refined, output_dir=output_dir, dt=dt, title=title, tldr=tldr)
    paths = write_solo_docs(raw_text=raw_text, refined=refined, output_dir=output_dir, dt=dt, title=title, tldr=tldr)
    await state.set_status(Status.IDLE)
    await broadcast({
@@ -46,6 +46,21 @@ Zweiter Absatz...
 """
 # System prompt (German) sent as the `system` field of the generate request in
 # OllamaClient.punctuate(). It tells the model to add ONLY punctuation and
 # sentence-initial capitalisation to a raw German transcript, to leave every
 # word untouched (nothing changed, shortened or added), and to answer with the
 # corrected text alone, without commentary.
 PUNCTUATE_PROMPT = """Du bekommst einen rohen deutschen Sprachtranskript ohne Großschreibung und Satzzeichen.
 Füge AUSSCHLIESSLICH Satzzeichen (Punkt, Komma, Fragezeichen, Ausrufezeichen) und Großschreibung am Satzanfang hinzu.
 Verändere KEINE Wörter, kürze NICHTS, füge NICHTS hinzu.
 Antworte NUR mit dem korrigierten Text, ohne Kommentar."""
 def _strip_code_fences(text: str) -> str:
    """Remove markdown code fences (```json ... ```) from LLM responses."""
    import re
    m = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", text)
    if m:
        return m.group(1)
    return text
 class OllamaClient:
    def __init__(self, base_url: str = "http://localhost:11434"):
        self.base_url = base_url
@@ -91,7 +106,7 @@ class OllamaClient:
                },
            )
            r.raise_for_status()
-            raw = r.json()["response"].strip()
+            raw = _strip_code_fences(r.json()["response"].strip())
        try:
            data = json.loads(raw)
            title = str(data.get("title", "")).strip() or "Diktat"
@@ -100,6 +115,21 @@ class OllamaClient:
        except Exception:
            return "Diktat", "Kein TL;DR verfügbar."
    async def punctuate(
        self,
        text: str,
        model: str = "gemma3:12b",
    ) -> str:
        """Ask the model to add punctuation and capitalisation to a raw transcript.

        The text is sent as the prompt of a non-streaming ``/api/generate``
        request with ``PUNCTUATE_PROMPT`` as the system prompt. That prompt
        instructs the model not to change any words, but this method does not
        verify that the model complied.

        Args:
            text: Raw transcript text to punctuate.
            model: Name of the model to use for the request.

        Returns:
            The model's response with surrounding whitespace stripped. The
            original ``text`` is returned unchanged when it is empty or
            whitespace-only (no request is made) or when the model's
            response is empty.

        Raises:
            Errors from the HTTP request, from ``raise_for_status()`` and from
            reading the ``"response"`` field are not caught here and propagate
            to the caller.
        """
        # With nothing to punctuate the model could only invent content,
        # which would then be stored as the transcript; skip the round trip.
        if not text.strip():
            return text

        async with httpx.AsyncClient(timeout=120) as client:
            r = await client.post(
                f"{self.base_url}/api/generate",
                json={"model": model, "prompt": text, "system": PUNCTUATE_PROMPT, "stream": False},
            )
            r.raise_for_status()
            result = r.json()["response"].strip()
        # An empty reply is treated as "no usable result": keep the raw text.
        return result if result else text
    async def identify_speakers(
        self,
        transcript_excerpt: str,
@@ -123,7 +153,7 @@ class OllamaClient:
            data = json.loads(raw)
            if not isinstance(data, dict):
                return {}
-            return {k: v for k, v in data.items() if v}
+            return {k: v for k, v in data.items() if v and str(v).lower() != "null"}
        except Exception:
            return {}