From b8cc8a3b33156567c960b6427367936f5fdb60ed Mon Sep 17 00:00:00 2001
From: "thomas.kopp"
Date: Thu, 2 Apr 2026 01:00:58 +0200
Subject: [PATCH] =?UTF-8?q?feat:=20align=5Fsegments()=20=E2=80=94=20map=20?=
 =?UTF-8?q?Whisper=20timestamps=20to=20pyannote=20speakers?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 alignment.py            | 56 ++++++++++++++++++++++++++++++++++++++++++
 tests/test_alignment.py | 53 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 109 insertions(+)
 create mode 100644 alignment.py
 create mode 100644 tests/test_alignment.py

diff --git a/alignment.py b/alignment.py
new file mode 100644
index 0000000..7c71feb
--- /dev/null
+++ b/alignment.py
@@ -0,0 +1,56 @@
+def align_segments(
+    whisper_segs: list[dict],
+    speaker_segs: list[tuple[float, float, str]],
+    *,
+    default_speaker: str = "SPEAKER_00",
+) -> list[tuple[str, str]]:
+    """Assign each Whisper segment to the speaker with the greatest time overlap.
+
+    Consecutive segments attributed to the same speaker are merged into one
+    block, with their texts joined by a single space. Segments whose text is
+    empty after stripping whitespace are skipped.
+
+    Args:
+        whisper_segs: Dicts with "start", "end" and "text" keys.
+        speaker_segs: (start, end, label) diarization turns.
+        default_speaker: Label used when a segment overlaps no speaker turn.
+
+    Returns:
+        (speaker_label, text) blocks in the order of ``whisper_segs``.
+    """
+    blocks: list[tuple[str, list[str]]] = []
+    for seg in whisper_segs:
+        text = seg["text"].strip()
+        if not text:
+            # Blank segments never reach the output, so skip the lookup.
+            continue
+        speaker = _best_speaker(
+            seg["start"], seg["end"], speaker_segs, default=default_speaker
+        )
+        if blocks and blocks[-1][0] == speaker:
+            blocks[-1][1].append(text)
+        else:
+            blocks.append((speaker, [text]))
+    # Join once per block instead of re-concatenating a growing string.
+    return [(speaker, " ".join(parts)) for speaker, parts in blocks]
+
+
+def _best_speaker(
+    start: float,
+    end: float,
+    speaker_segs: list[tuple[float, float, str]],
+    default: str = "SPEAKER_00",
+) -> str:
+    """Return the label of the speaker turn that overlaps [start, end] most.
+
+    Ties keep the earliest turn in ``speaker_segs``; ``default`` is returned
+    when no turn has a positive overlap (including an empty ``speaker_segs``).
+    """
+    best_label = default
+    best_overlap = 0.0
+    for s_start, s_end, label in speaker_segs:
+        overlap = max(0.0, min(end, s_end) - max(start, s_start))
+        if overlap > best_overlap:
+            best_overlap = overlap
+            best_label = label
+    return best_label
diff --git a/tests/test_alignment.py b/tests/test_alignment.py
new file mode 100644
index 0000000..c303199
--- /dev/null
+++ b/tests/test_alignment.py
@@ -0,0 +1,53 @@
+def test_align_assigns_speaker_by_overlap():
+    from alignment import align_segments
+    whisper = [
+        {"start": 0.0, "end": 2.0, "text": "Hallo"},
+        {"start": 2.1, "end": 4.0, "text": "Wie geht es"},
+    ]
+    speakers = [
+        (0.0, 2.5, "SPEAKER_00"),
+        (2.5, 5.0, "SPEAKER_01"),
+    ]
+    result = align_segments(whisper, speakers)
+    assert result[0] == ("SPEAKER_00", "Hallo")
+    assert result[1] == ("SPEAKER_01", "Wie geht es")
+
+
+def test_align_merges_consecutive_same_speaker():
+    from alignment import align_segments
+    whisper = [
+        {"start": 0.0, "end": 1.0, "text": "Hallo"},
+        {"start": 1.1, "end": 2.0, "text": "Welt"},
+    ]
+    speakers = [(0.0, 3.0, "SPEAKER_00")]
+    result = align_segments(whisper, speakers)
+    assert len(result) == 1
+    assert result[0] == ("SPEAKER_00", "Hallo Welt")
+
+
+def test_align_fallback_when_no_speaker_overlap():
+    from alignment import align_segments
+    whisper = [{"start": 0.0, "end": 1.0, "text": "Hallo"}]
+    speakers = []
+    result = align_segments(whisper, speakers)
+    assert result[0][0] == "SPEAKER_00"
+
+
+def test_align_uses_custom_default_speaker():
+    from alignment import align_segments
+    whisper = [{"start": 10.0, "end": 11.0, "text": "Hallo"}]
+    speakers = [(0.0, 5.0, "SPEAKER_01")]
+    result = align_segments(whisper, speakers, default_speaker="UNKNOWN")
+    assert result == [("UNKNOWN", "Hallo")]
+
+
+def test_align_skips_blank_segments():
+    from alignment import align_segments
+    whisper = [
+        {"start": 0.0, "end": 1.0, "text": "Hallo"},
+        {"start": 1.0, "end": 2.0, "text": "   "},
+        {"start": 2.0, "end": 3.0, "text": "Welt"},
+    ]
+    speakers = [(0.0, 3.0, "SPEAKER_00")]
+    result = align_segments(whisper, speakers)
+    assert result == [("SPEAKER_00", "Hallo Welt")]