Vowel families must be local

Slant/multi attachment, multi buckets, and unequal-length containment
fusion now require members within ~6-8 lines. "All Me" was chaining
Baby/Mercedes/loan/being/Daddy into one teal family across 110 lines;
vowel assonance is local evidence, not global. Perfect rhymes and
equal-key fusion stay global (a real rhyme carries any distance).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-07 14:56:56 -04:00
parent fec0485b68
commit 411681d0a0
2 changed files with 51 additions and 6 deletions
+39 -6
View File
@@ -696,7 +696,8 @@ def analyze(draft: Draft):
def attach_or_collect(t, key, bucket, gmap):
gi = gmap.get((t["sid"], key))
if gi is not None:
if gi is not None and any(abs(m["line"] - t["line"]) <= 8
for m in raw_groups[gi]["toks"]):
raw_groups[gi]["toks"].append(t)
t["slant"] = True
grouped.add(id(t))
@@ -819,16 +820,40 @@ def analyze(draft: Draft):
attach_or_collect(p, key, by_multi, group_by_multi)
for (sid, key), toks in by_par.items():
toks = [t for t in toks if id(t) not in grouped]
if len(toks) >= 2 and len({t["word"].split()[0] for t in toks}) >= 2:
raw_groups.append({"toks": toks, "slant": True, "key": key})
grouped.update(id(t) for t in toks)
toks = sorted((t for t in toks if id(t) not in grouped),
key=lambda t: t["line"])
runs, cur = [], []
for t in toks:
if cur and t["line"] - cur[-1]["line"] > 6:
runs.append(cur)
cur = []
cur.append(t)
if cur:
runs.append(cur)
for run in runs:
if len(run) >= 2 and len({t["word"].split()[0] for t in run}) >= 2:
raw_groups.append({"toks": run, "slant": True, "key": key})
grouped.update(id(t) for t in run)
# biggest buckets claim first (a token may sit in several via its
# anchors); distinctness by anchor word, so the phrase "fire burns"
# can't pose as a different word than the "fire" it starts with
def _flush_multi(toks, key):
toks = [t for t in toks if id(t) not in grouped]
toks = sorted((t for t in toks if id(t) not in grouped),
key=lambda t: t["line"])
# vowel evidence is local: split on gaps of more than 6 lines
runs, cur = [], []
for t in toks:
if cur and t["line"] - cur[-1]["line"] > 6:
runs.append(cur)
cur = []
cur.append(t)
if cur:
runs.append(cur)
for run in runs:
_flush_multi_run(run, key)
def _flush_multi_run(toks, key):
if len(toks) < 2 or len({t["word"].split()[0] for t in toks}) < 2:
return
# an all-phrase bucket whose members mirror the same two word
@@ -1029,6 +1054,14 @@ def analyze(draft: Draft):
if (len(va) == 1 and len(vb) == 1
and not _sets_nest(_gtags(ai), _gtags(bi))):
continue
if va != vb:
# containment (unequal lengths) is weaker evidence
# than identity: the families must actually meet —
# Baby (l23) never fuses with Daddy (l136)
la = {t["line"] for t in raw_groups[ai]["toks"]}
lb = {t["line"] for t in raw_groups[bi]["toks"]}
if min(abs(x - y) for x in la for y in lb) > 8:
continue
mparent[mfind(ai)] = mfind(bi)
mclusters = defaultdict(list)
+12
View File
@@ -686,3 +686,15 @@ def test_forever_does_not_glue_to_sequential():
assert "forever" not in seq and "ever" not in seq
assert "question" not in seq and "extra" not in seq
group_with(text, "style", "mile")
def test_vowel_families_are_local():
    """Vowel families must not be chained across a 100-line gap.

    Regression for "All Me", where Baby/Daddy/loan/being ended up as one
    teal blob spanning the whole draft.
    """
    from collections import defaultdict

    filler = "filler line here\n" * 100
    text = "Baby girl I see you\n" + filler + "my old daddy\n"
    res = analyze(Draft(text=text))

    # Collect the lowercased surface text of every token, bucketed by
    # tok["g"] (presumably the group/family id -- confirm against analyze).
    words_by_group = defaultdict(set)
    for tok in res["tokens"]:
        line = res["lines"][tok["l"]]
        words_by_group[tok["g"]].add(line[tok["s"]:tok["e"]].lower())

    # No single bucket may hold both ends of the 100-line gap.
    for words in words_by_group.values():
        assert not {"baby", "daddy"} <= words