fix: punctuate raw transcript, strip JSON code fences, filter null speaker names
- llm: punctuate() adds punctuation/capitalisation without changing words
- llm: _strip_code_fences() handles markdown-wrapped JSON from gemma3
- llm: filter string 'null' from identify_speakers result
- pipeline: punctuate raw_text in parallel with refine for solo recordings

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+5
-6
@@ -74,15 +74,14 @@ async def _run_solo_pipeline(cfg, wav_path, output_dir, instructions):
|
|||||||
await broadcast({"event": "transcribed", "raw": raw_text})
|
await broadcast({"event": "transcribed", "raw": raw_text})
|
||||||
|
|
||||||
client = OllamaClient(base_url=cfg["ollama"]["base_url"])
|
client = OllamaClient(base_url=cfg["ollama"]["base_url"])
|
||||||
refined = await client.refine(
|
punctuated, refined = await asyncio.gather(
|
||||||
raw_text=raw_text,
|
client.punctuate(raw_text, model=cfg["ollama"]["model"]),
|
||||||
instructions=instructions,
|
client.refine(raw_text=raw_text, instructions=instructions, model=cfg["ollama"]["model"]),
|
||||||
model=cfg["ollama"]["model"],
|
|
||||||
)
|
)
|
||||||
|
title, tldr = await client.generate_title_and_tldr(refined, model=cfg["ollama"]["model"])
|
||||||
|
|
||||||
dt = datetime.now()
|
dt = datetime.now()
|
||||||
title, tldr = await client.generate_title_and_tldr(refined, model=cfg["ollama"]["model"])
|
paths = write_solo_docs(raw_text=punctuated, refined=refined, output_dir=output_dir, dt=dt, title=title, tldr=tldr)
|
||||||
paths = write_solo_docs(raw_text=raw_text, refined=refined, output_dir=output_dir, dt=dt, title=title, tldr=tldr)
|
|
||||||
|
|
||||||
await state.set_status(Status.IDLE)
|
await state.set_status(Status.IDLE)
|
||||||
await broadcast({
|
await broadcast({
|
||||||
|
|||||||
@@ -46,6 +46,21 @@ Zweiter Absatz...
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
PUNCTUATE_PROMPT = """Du bekommst einen rohen deutschen Sprachtranskript ohne Großschreibung und Satzzeichen.
|
||||||
|
Füge AUSSCHLIESSLICH Satzzeichen (Punkt, Komma, Fragezeichen, Ausrufezeichen) und Großschreibung am Satzanfang hinzu.
|
||||||
|
Verändere KEINE Wörter, kürze NICHTS, füge NICHTS hinzu.
|
||||||
|
Antworte NUR mit dem korrigierten Text, ohne Kommentar."""
|
||||||
|
|
||||||
|
|
||||||
|
def _strip_code_fences(text: str) -> str:
|
||||||
|
"""Remove markdown code fences (```json ... ```) from LLM responses."""
|
||||||
|
import re
|
||||||
|
m = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", text)
|
||||||
|
if m:
|
||||||
|
return m.group(1)
|
||||||
|
return text
|
||||||
|
|
||||||
|
|
||||||
class OllamaClient:
|
class OllamaClient:
|
||||||
def __init__(self, base_url: str = "http://localhost:11434"):
|
def __init__(self, base_url: str = "http://localhost:11434"):
|
||||||
self.base_url = base_url
|
self.base_url = base_url
|
||||||
@@ -91,7 +106,7 @@ class OllamaClient:
|
|||||||
},
|
},
|
||||||
)
|
)
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
raw = r.json()["response"].strip()
|
raw = _strip_code_fences(r.json()["response"].strip())
|
||||||
try:
|
try:
|
||||||
data = json.loads(raw)
|
data = json.loads(raw)
|
||||||
title = str(data.get("title", "")).strip() or "Diktat"
|
title = str(data.get("title", "")).strip() or "Diktat"
|
||||||
@@ -100,6 +115,21 @@ class OllamaClient:
|
|||||||
except Exception:
|
except Exception:
|
||||||
return "Diktat", "Kein TL;DR verfügbar."
|
return "Diktat", "Kein TL;DR verfügbar."
|
||||||
|
|
||||||
|
async def punctuate(
    self,
    text: str,
    model: str = "gemma3:12b",
) -> str:
    """Add punctuation and capitalisation to a raw transcript via the LLM.

    Sends ``text`` as the prompt to the ``/api/generate`` endpoint under
    ``self.base_url`` with ``PUNCTUATE_PROMPT`` as the system prompt, which
    instructs the model to leave the words themselves untouched.

    Args:
        text: Raw transcript text to punctuate.
        model: Name of the model to request.

    Returns:
        The stripped model response, or ``text`` unchanged when the input is
        blank or the model returns an empty response.

    Raises:
        httpx.HTTPStatusError: If the server answers with an error status
            (raised by ``raise_for_status``).
    """
    # Nothing to punctuate: skip the round trip so the model is never asked
    # to "correct" an empty transcript.
    if not text.strip():
        return text

    async with httpx.AsyncClient(timeout=120) as client:
        r = await client.post(
            f"{self.base_url}/api/generate",
            json={"model": model, "prompt": text, "system": PUNCTUATE_PROMPT, "stream": False},
        )
        r.raise_for_status()
        result = r.json()["response"].strip()
    # Fall back to the raw transcript rather than returning an empty string.
    return result if result else text
|
||||||
|
|
||||||
async def identify_speakers(
|
async def identify_speakers(
|
||||||
self,
|
self,
|
||||||
transcript_excerpt: str,
|
transcript_excerpt: str,
|
||||||
@@ -123,7 +153,7 @@ class OllamaClient:
|
|||||||
data = json.loads(raw)
|
data = json.loads(raw)
|
||||||
if not isinstance(data, dict):
|
if not isinstance(data, dict):
|
||||||
return {}
|
return {}
|
||||||
return {k: v for k, v in data.items() if v}
|
return {k: v for k, v in data.items() if v and str(v).lower() != "null"}
|
||||||
except Exception:
|
except Exception:
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user