fix: punctuate raw transcript, strip JSON code fences, filter null speaker names
- llm: punctuate() adds punctuation/capitalisation without changing words
- llm: _strip_code_fences() handles markdown-wrapped JSON from gemma3
- llm: filter string 'null' from identify_speakers result
- pipeline: punctuate raw_text in parallel with refine for solo recordings

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -46,6 +46,21 @@ Zweiter Absatz...
|
||||
"""
|
||||
|
||||
|
||||
# System prompt (German) sent by OllamaClient.punctuate(). In English it says:
# the input is a raw German speech transcript without capitalisation or
# punctuation; add ONLY punctuation (period, comma, question mark,
# exclamation mark) and sentence-initial capitalisation; change, shorten and
# add nothing; answer with the corrected text only, without commentary.
PUNCTUATE_PROMPT = """Du bekommst einen rohen deutschen Sprachtranskript ohne Großschreibung und Satzzeichen.
|
||||
Füge AUSSCHLIESSLICH Satzzeichen (Punkt, Komma, Fragezeichen, Ausrufezeichen) und Großschreibung am Satzanfang hinzu.
|
||||
Verändere KEINE Wörter, kürze NICHTS, füge NICHTS hinzu.
|
||||
Antworte NUR mit dem korrigierten Text, ohne Kommentar."""
|
||||
|
||||
|
||||
def _strip_code_fences(text: str) -> str:
|
||||
"""Remove markdown code fences (```json ... ```) from LLM responses."""
|
||||
import re
|
||||
m = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", text)
|
||||
if m:
|
||||
return m.group(1)
|
||||
return text
|
||||
|
||||
|
||||
class OllamaClient:
|
||||
def __init__(self, base_url: str = "http://localhost:11434"):
|
||||
self.base_url = base_url
|
||||
@@ -91,7 +106,7 @@ class OllamaClient:
|
||||
},
|
||||
)
|
||||
r.raise_for_status()
|
||||
raw = r.json()["response"].strip()
|
||||
raw = _strip_code_fences(r.json()["response"].strip())
|
||||
try:
|
||||
data = json.loads(raw)
|
||||
title = str(data.get("title", "")).strip() or "Diktat"
|
||||
@@ -100,6 +115,21 @@ class OllamaClient:
|
||||
except Exception:
|
||||
return "Diktat", "Kein TL;DR verfügbar."
|
||||
|
||||
async def punctuate(
    self,
    text: str,
    model: str = "gemma3:12b",
) -> str:
    """Add punctuation and capitalisation to raw whisper output without changing words.

    Posts ``text`` as the prompt and ``PUNCTUATE_PROMPT`` as the system
    message to ``{self.base_url}/api/generate`` with streaming disabled.

    Args:
        text: Raw transcript to punctuate.
        model: Model name passed through in the request payload.

    Returns:
        The stripped ``response`` field of the reply. ``text`` is returned
        unchanged when it is empty/whitespace-only (no request is made) or
        when the reply is empty.

    Raises:
        Propagates the error from ``raise_for_status()`` on an HTTP error
        status, and ``KeyError`` if the reply has no ``response`` field.
    """
    # Nothing to punctuate: skip the round-trip so the model cannot invent
    # content for a blank transcript.
    if not text or not text.strip():
        return text

    payload = {
        "model": model,
        "prompt": text,
        "system": PUNCTUATE_PROMPT,
        "stream": False,
    }
    async with httpx.AsyncClient(timeout=120) as client:
        r = await client.post(f"{self.base_url}/api/generate", json=payload)
        r.raise_for_status()
        result = r.json()["response"].strip()
        # An empty reply would discard the transcript; keep the raw text instead.
        return result if result else text
|
||||
|
||||
async def identify_speakers(
|
||||
self,
|
||||
transcript_excerpt: str,
|
||||
@@ -123,7 +153,7 @@ class OllamaClient:
|
||||
data = json.loads(raw)
|
||||
if not isinstance(data, dict):
|
||||
return {}
|
||||
return {k: v for k, v in data.items() if v}
|
||||
return {k: v for k, v in data.items() if v and str(v).lower() != "null"}
|
||||
except Exception:
|
||||
return {}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user