feat: align_segments() — map Whisper timestamps to pyannote speakers
This commit is contained in:
@@ -0,0 +1,32 @@
|
|||||||
|
def align_segments(
    whisper_segs: list[dict],
    speaker_segs: list[tuple[float, float, str]],
) -> list[tuple[str, str]]:
    """Assign each Whisper segment to the speaker with the greatest time overlap.

    Consecutive segments from the same speaker are merged into one block,
    with their texts joined by a single space.

    Args:
        whisper_segs: Segment dicts; each is read via its ``"start"``,
            ``"end"`` and ``"text"`` keys.
        speaker_segs: ``(start, end, label)`` tuples describing speaker turns.

    Returns:
        A list of ``(speaker_label, text)`` tuples in input order. Segments
        whose text is empty after stripping whitespace are skipped and do not
        interrupt a run of the same speaker.
    """
    # Collect the text pieces per block and join once at the end, instead of
    # rebuilding an ever-growing string on every merge.
    blocks: list[tuple[str, list[str]]] = []
    for seg in whisper_segs:
        text = seg["text"].strip()
        if not text:
            # Blank segments contribute nothing, so skip them before paying
            # for the speaker lookup.
            continue
        speaker = _best_speaker(seg["start"], seg["end"], speaker_segs)
        if blocks and blocks[-1][0] == speaker:
            blocks[-1][1].append(text)
        else:
            blocks.append((speaker, [text]))
    return [(speaker, " ".join(parts)) for speaker, parts in blocks]
|
||||||
|
|
||||||
|
|
||||||
|
def _best_speaker(
|
||||||
|
start: float,
|
||||||
|
end: float,
|
||||||
|
speaker_segs: list[tuple[float, float, str]],
|
||||||
|
) -> str:
|
||||||
|
best_label = "SPEAKER_00"
|
||||||
|
best_overlap = 0.0
|
||||||
|
for s_start, s_end, label in speaker_segs:
|
||||||
|
overlap = max(0.0, min(end, s_end) - max(start, s_start))
|
||||||
|
if overlap > best_overlap:
|
||||||
|
best_overlap = overlap
|
||||||
|
best_label = label
|
||||||
|
return best_label
|
||||||
@@ -0,0 +1,33 @@
|
|||||||
|
def test_align_assigns_speaker_by_overlap():
    """Each segment gets the label of the speaker turn it overlaps the most."""
    from alignment import align_segments

    # The second segment straddles the 2.5 s turn boundary but lies mostly
    # inside the SPEAKER_01 turn.
    transcript = [
        {"start": 0.0, "end": 2.0, "text": "Hallo"},
        {"start": 2.1, "end": 4.0, "text": "Wie geht es"},
    ]
    turns = [
        (0.0, 2.5, "SPEAKER_00"),
        (2.5, 5.0, "SPEAKER_01"),
    ]

    aligned = align_segments(transcript, turns)

    assert aligned[0] == ("SPEAKER_00", "Hallo")
    assert aligned[1] == ("SPEAKER_01", "Wie geht es")
|
||||||
|
|
||||||
|
|
||||||
|
def test_align_merges_consecutive_same_speaker():
    """Adjacent segments attributed to one speaker collapse into a single block."""
    from alignment import align_segments

    transcript = [
        {"start": 0.0, "end": 1.0, "text": "Hallo"},
        {"start": 1.1, "end": 2.0, "text": "Welt"},
    ]
    # A single turn covers both segments.
    single_turn = [(0.0, 3.0, "SPEAKER_00")]

    aligned = align_segments(transcript, single_turn)

    assert len(aligned) == 1
    assert aligned[0] == ("SPEAKER_00", "Hallo Welt")
|
||||||
|
|
||||||
|
|
||||||
|
def test_align_fallback_when_no_speaker_overlap():
    """With no speaker turns at all, the segment gets the default label."""
    from alignment import align_segments

    transcript = [{"start": 0.0, "end": 1.0, "text": "Hallo"}]
    no_turns = []

    aligned = align_segments(transcript, no_turns)

    assigned_label = aligned[0][0]
    assert assigned_label == "SPEAKER_00"
|
||||||
Reference in New Issue
Block a user