From 411681d0a0e431a6102e2c38551997cfd5983517 Mon Sep 17 00:00:00 2001 From: Kenneth Reitz Date: Sun, 7 Jun 2026 14:56:56 -0400 Subject: [PATCH] Vowel families must be local Slant/multi attachment, multi buckets, and unequal-length containment fusion now require members within ~6-8 lines. "All Me" was chaining Baby/Mercedes/loan/being/Daddy into one teal family across 110 lines; vowel assonance is local evidence, not global. Perfect rhymes and equal-key fusion stay global (a real rhyme carries any distance). Co-Authored-By: Claude Opus 4.8 (1M context) --- app.py | 45 ++++++++++++++++++++++++++++++++++++++------ tests/test_rhymes.py | 12 ++++++++++++ 2 files changed, 51 insertions(+), 6 deletions(-) diff --git a/app.py b/app.py index 849f256..4386afe 100644 --- a/app.py +++ b/app.py @@ -696,7 +696,8 @@ def analyze(draft: Draft): def attach_or_collect(t, key, bucket, gmap): gi = gmap.get((t["sid"], key)) - if gi is not None: + if gi is not None and any(abs(m["line"] - t["line"]) <= 8 + for m in raw_groups[gi]["toks"]): raw_groups[gi]["toks"].append(t) t["slant"] = True grouped.add(id(t)) @@ -819,16 +820,40 @@ def analyze(draft: Draft): attach_or_collect(p, key, by_multi, group_by_multi) for (sid, key), toks in by_par.items(): - toks = [t for t in toks if id(t) not in grouped] - if len(toks) >= 2 and len({t["word"].split()[0] for t in toks}) >= 2: - raw_groups.append({"toks": toks, "slant": True, "key": key}) - grouped.update(id(t) for t in toks) + toks = sorted((t for t in toks if id(t) not in grouped), + key=lambda t: t["line"]) + runs, cur = [], [] + for t in toks: + if cur and t["line"] - cur[-1]["line"] > 6: + runs.append(cur) + cur = [] + cur.append(t) + if cur: + runs.append(cur) + for run in runs: + if len(run) >= 2 and len({t["word"].split()[0] for t in run}) >= 2: + raw_groups.append({"toks": run, "slant": True, "key": key}) + grouped.update(id(t) for t in run) # biggest buckets claim first (a token may sit in several via its # anchors); distinctness by anchor word, so the phrase "fire burns" # can't pose as a different word than the "fire" it starts with def _flush_multi(toks, key): - toks = [t for t in toks if id(t) not in grouped] + toks = sorted((t for t in toks if id(t) not in grouped), + key=lambda t: t["line"]) + # vowel evidence is local: split on gaps of more than 6 lines + runs, cur = [], [] + for t in toks: + if cur and t["line"] - cur[-1]["line"] > 6: + runs.append(cur) + cur = [] + cur.append(t) + if cur: + runs.append(cur) + for run in runs: + _flush_multi_run(run, key) + + def _flush_multi_run(toks, key): if len(toks) < 2 or len({t["word"].split()[0] for t in toks}) < 2: return # an all-phrase bucket whose members mirror the same two word @@ -1029,6 +1054,14 @@ def analyze(draft: Draft): if (len(va) == 1 and len(vb) == 1 and not _sets_nest(_gtags(ai), _gtags(bi))): continue + if va != vb: + # containment (unequal lengths) is weaker evidence + # than identity: the families must actually meet — + # Baby (l23) never fuses with Daddy (l136) + la = {t["line"] for t in raw_groups[ai]["toks"]} + lb = {t["line"] for t in raw_groups[bi]["toks"]} + if min(abs(x - y) for x in la for y in lb) > 8: + continue mparent[mfind(ai)] = mfind(bi) mclusters = defaultdict(list) diff --git a/tests/test_rhymes.py b/tests/test_rhymes.py index 0561927..f6818bf 100644 --- a/tests/test_rhymes.py +++ b/tests/test_rhymes.py @@ -686,3 +686,15 @@ def test_forever_does_not_glue_to_sequential(): assert "forever" not in seq and "ever" not in seq assert "question" not in seq and "extra" not in seq group_with(text, "style", "mile") + + +def test_vowel_families_are_local(): + # a 100-line gap can't chain a vowel family — Baby/Daddy/loan/being + # were one teal blob across all of "All Me" + text = "Baby girl I see you\n" + ("filler line here\n" * 100) + "my old daddy\n" + from collections import defaultdict + res = analyze(Draft(text=text)) + bg = defaultdict(set) + for tok in res["tokens"]: + bg[tok["g"]].add(res["lines"][tok["l"]][tok["s"]:tok["e"]].lower()) + assert not any({"baby", "daddy"} <= s for s in bg.values())