mirror of
https://github.com/kennethreitz/rhymepad.org.git
synced 2026-06-11 17:08:33 +00:00
Vowel families must be local
Slant/multi attachment, multi buckets, and unequal-length containment fusion now require members to be close together (attachment and containment fusion: within 8 lines; slant and multi buckets: split on gaps of more than 6 lines). "All Me" was chaining Baby/Mercedes/loan/being/Daddy into one teal family across 110 lines; vowel assonance is local evidence, not global. Perfect rhymes and equal-key fusion stay global (a real rhyme carries any distance).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -696,7 +696,8 @@ def analyze(draft: Draft):
|
||||
|
||||
def attach_or_collect(t, key, bucket, gmap):
|
||||
gi = gmap.get((t["sid"], key))
|
||||
if gi is not None:
|
||||
if gi is not None and any(abs(m["line"] - t["line"]) <= 8
|
||||
for m in raw_groups[gi]["toks"]):
|
||||
raw_groups[gi]["toks"].append(t)
|
||||
t["slant"] = True
|
||||
grouped.add(id(t))
|
||||
@@ -819,16 +820,40 @@ def analyze(draft: Draft):
|
||||
attach_or_collect(p, key, by_multi, group_by_multi)
|
||||
|
||||
for (sid, key), toks in by_par.items():
|
||||
toks = [t for t in toks if id(t) not in grouped]
|
||||
if len(toks) >= 2 and len({t["word"].split()[0] for t in toks}) >= 2:
|
||||
raw_groups.append({"toks": toks, "slant": True, "key": key})
|
||||
grouped.update(id(t) for t in toks)
|
||||
toks = sorted((t for t in toks if id(t) not in grouped),
|
||||
key=lambda t: t["line"])
|
||||
runs, cur = [], []
|
||||
for t in toks:
|
||||
if cur and t["line"] - cur[-1]["line"] > 6:
|
||||
runs.append(cur)
|
||||
cur = []
|
||||
cur.append(t)
|
||||
if cur:
|
||||
runs.append(cur)
|
||||
for run in runs:
|
||||
if len(run) >= 2 and len({t["word"].split()[0] for t in run}) >= 2:
|
||||
raw_groups.append({"toks": run, "slant": True, "key": key})
|
||||
grouped.update(id(t) for t in run)
|
||||
|
||||
# biggest buckets claim first (a token may sit in several via its
|
||||
# anchors); distinctness by anchor word, so the phrase "fire burns"
|
||||
# can't pose as a different word than the "fire" it starts with
|
||||
def _flush_multi(toks, key):
    """Split a multi bucket into line-local runs and flush each run.

    Tokens whose ``id`` is already in ``grouped`` are dropped. The rest
    are ordered by their ``"line"`` value and cut into runs wherever two
    neighbours are more than 6 lines apart. Every run is then passed to
    ``_flush_multi_run`` together with ``key``.
    """
    ungrouped = [tok for tok in toks if id(tok) not in grouped]
    ungrouped.sort(key=lambda tok: tok["line"])

    # vowel evidence is local: a gap of more than 6 lines starts a new run
    local_runs = []
    for tok in ungrouped:
        starts_new_run = (
            not local_runs
            or tok["line"] - local_runs[-1][-1]["line"] > 6
        )
        if starts_new_run:
            local_runs.append([])
        local_runs[-1].append(tok)

    # all runs are built before any is flushed, so the split never
    # depends on what flushing an earlier run does
    for local_run in local_runs:
        _flush_multi_run(local_run, key)
|
||||
|
||||
def _flush_multi_run(toks, key):
|
||||
if len(toks) < 2 or len({t["word"].split()[0] for t in toks}) < 2:
|
||||
return
|
||||
# an all-phrase bucket whose members mirror the same two word
|
||||
@@ -1029,6 +1054,14 @@ def analyze(draft: Draft):
|
||||
if (len(va) == 1 and len(vb) == 1
|
||||
and not _sets_nest(_gtags(ai), _gtags(bi))):
|
||||
continue
|
||||
if va != vb:
|
||||
# containment (unequal lengths) is weaker evidence
|
||||
# than identity: the families must actually meet —
|
||||
# Baby (l23) never fuses with Daddy (l136)
|
||||
la = {t["line"] for t in raw_groups[ai]["toks"]}
|
||||
lb = {t["line"] for t in raw_groups[bi]["toks"]}
|
||||
if min(abs(x - y) for x in la for y in lb) > 8:
|
||||
continue
|
||||
mparent[mfind(ai)] = mfind(bi)
|
||||
|
||||
mclusters = defaultdict(list)
|
||||
|
||||
@@ -686,3 +686,15 @@ def test_forever_does_not_glue_to_sequential():
|
||||
assert "forever" not in seq and "ever" not in seq
|
||||
assert "question" not in seq and "extra" not in seq
|
||||
group_with(text, "style", "mile")
|
||||
|
||||
|
||||
def test_vowel_families_are_local():
    """A vowel family must not span a 100-line gap.

    Baby/Daddy/loan/being used to end up as one teal blob across all of
    "All Me"; here "baby" and "daddy" sit 100 filler lines apart and
    must not share a group.
    """
    from collections import defaultdict

    lyric = "".join([
        "Baby girl I see you\n",
        "filler line here\n" * 100,
        "my old daddy\n",
    ])
    result = analyze(Draft(text=lyric))

    # collect the lowercased text of every token under its group id
    words_by_group = defaultdict(set)
    for token in result["tokens"]:
        line_text = result["lines"][token["l"]]
        word = line_text[token["s"]:token["e"]].lower()
        words_by_group[token["g"]].add(word)

    far_apart = {"baby", "daddy"}
    assert all(not far_apart <= members for members in words_by_group.values())
|
||||
|
||||
Reference in New Issue
Block a user