Weak V-x families stop gluing: coda-nested buckets, gated fusion

The spotlight exposed forever/question/extra/sequential as one EH-x
family. Three fixes:
- phrases obey refrain muting (the «Forever ever» carpet)
- bare V-x buckets subdivide by nesting final-coda classes, so
  placement (NT) keeps creation (N) but forever (.) releases
  sequential (L); phrases ride the main cluster
- single-vowel suffix fusion requires member coda sets to nest

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-07 14:40:34 -04:00
parent bd542e0b2e
commit 77316330ea
2 changed files with 120 additions and 12 deletions
+104 -12
View File
@@ -337,6 +337,19 @@ def _m2_key(seq: list[str]) -> str | None:
return f"m2:{vowels[0]} {_coda_class(ph[vi + 1])} x"
def _final_coda_tag(pl: list[str]) -> str:
"""Coda-class string after the LAST vowel ('.' for open)."""
last = -1
for i, p in enumerate(pl):
if p[-1].isdigit():
last = i
coda = "".join(_coda_class(DIGITS.sub("", p)) for p in pl[last + 1:])
return coda or "."
WEAK_MK = re.compile(r"^m:[A-Z]+ x$")
def multi_keys(word: str) -> tuple[str, ...]:
"""Multisyllabic keys across all candidate pronunciations, anchored at
the last stressed vowel AND the first primary stress — KET-a-mine can
@@ -514,6 +527,11 @@ def analyze(draft: Draft):
"sid": sids[i], "gid": None, "slant": False,
})
# words the draft leans on as refrain/filler (4+ uses) stop lighting
# up mid-line — their line-end uses still count
counts = Counter(t["word"].lower() for t in tokens)
refrain = {w for w, c in counts.items() if c >= 4}
# phrase tokens: adjacent word pairs, so multi-word rhymes can match
# single words (orange / door hinge). Anchored at the first word's
# stressed vowel; competes in the multisyllabic slant pass only.
@@ -523,6 +541,8 @@ def analyze(draft: Draft):
line_toks[t["line"]].append(t)
for toks in line_toks.values():
for a, b in zip(toks, toks[1:]):
if a["word"].lower() in refrain:
continue # «Forever ever» carpets obey refrain muting too
pa, pb = phones_for(a["word"]), phones_for(b["word"])
if not (pa and pb):
continue
@@ -544,7 +564,8 @@ def analyze(draft: Draft):
# a phrase touching a stopword ("were up") may still match
# perfectly, but never competes in the vowel-only passes
"weak": (a["word"].lower() in STOPWORDS
or b["word"].lower() in STOPWORDS),
or b["word"].lower() in STOPWORDS
or b["word"].lower() in refrain),
})
# mosaic triples: three-word runs whose vowel run is the rhyme —
@@ -554,6 +575,8 @@ def analyze(draft: Draft):
for a, b, c in zip(toks, toks[1:], toks[2:]):
if a["word"].lower() in STOPWORDS:
continue
if a["word"].lower() in refrain:
continue
pa, pb, pc = (phones_for(a["word"]), phones_for(b["word"]),
phones_for(c["word"]))
if not (pa and pb and pc):
@@ -581,11 +604,6 @@ def analyze(draft: Draft):
"weak": False,
})
# words the draft leans on as refrain/filler (4+ uses) stop lighting
# up mid-line — their line-end uses still count
counts = Counter(t["word"].lower() for t in tokens)
refrain = {w for w, c in counts.items() if c >= 4}
# pass 1: perfect rhymes (shared rime), anywhere in a line — this is
# what catches internal rhymes. Phrases compete too, so "stir up"
# perfect-rhymes "syrup" even while its "up" rhymes with "cup".
@@ -809,21 +827,67 @@ def analyze(draft: Draft):
# biggest buckets claim first (a token may sit in several via its
# anchors); distinctness by anchor word, so the phrase "fire burns"
# can't pose as a different word than the "fire" it starts with
for (sid, key), toks in sorted(by_multi.items(),
key=lambda kv: (-len(kv[1]), kv[0][1])):
def _flush_multi(toks, key):
toks = [t for t in toks if id(t) not in grouped]
if len(toks) < 2 or len({t["word"].split()[0] for t in toks}) < 2:
continue
return
# an all-phrase bucket whose members mirror the same two word
# groups is pure redundancy (oh my / go rhyme over oh+go, my+rhyme)
halves = {t.get("halves") for t in toks}
if (len(halves) == 1
and None not in halves
and None not in next(iter(halves))):
continue
return
raw_groups.append({"toks": toks, "slant": True, "key": key})
grouped.update(id(t) for t in toks)
def _word_tag(t):
ph = phones_for(t["word"])
return _final_coda_tag(ph.split()) if ph else "."
def _tags_ok(ta, tb):
if ta == tb:
return True
if ta == "." or tb == ".":
return False
return (ta.startswith(tb) or tb.startswith(ta)
or ta.endswith(tb) or tb.endswith(ta))
for (sid, key), toks in sorted(by_multi.items(),
key=lambda kv: (-len(kv[1]), kv[0][1])):
if not WEAK_MK.match(key):
_flush_multi(toks, key)
continue
# a bare V-x signature is too weak on its own: subdivide the
# bucket by nesting final-coda classes, so placement (NT) keeps
# creation (N) but forever (.) lets go of sequential (L)
words = [t for t in toks if " " not in t["word"]]
phs = [t for t in toks if " " in t["word"]]
tags = [_word_tag(t) for t in words]
par = list(range(len(words)))
def _f(i):
while par[i] != i:
par[i] = par[par[i]]
i = par[i]
return i
for i in range(len(words)):
for j in range(i + 1, len(words)):
if _tags_ok(tags[i], tags[j]):
par[_f(i)] = _f(j)
clus = defaultdict(list)
for i, t in enumerate(words):
clus[_f(i)].append(t)
subsets = sorted(clus.values(), key=len, reverse=True)
if phs:
if subsets:
subsets[0].extend(phs) # phrases ride the main cluster
else:
subsets = [phs]
for sub in subsets:
_flush_multi(sub, key)
# pass 4: consonance-aware slant anywhere in a line — last stressed
# vowel + first coda consonant, so bliss / whisps / exist (IH S) group
# even though their full codas differ
@@ -915,11 +979,34 @@ def analyze(draft: Draft):
return len(short) <= len(long) and long[len(long) - len(short):] == short
def _sig(key):
vs = key[2:].split()
vs = key[2:].split("|", 1)[0].split()
while vs and vs[-1] == "x":
vs.pop() # trailing schwas fall off the beat on both sides
return vs
def _gtags(gi):
out = set()
for t in raw_groups[gi]["toks"]:
if " " in t["word"]:
continue
ph = phones_for(t["word"])
if ph:
out.add(_final_coda_tag(ph.split()))
return out
def _tags_ok2(ta, tb):
if ta == tb:
return True
if ta == "." or tb == ".":
return False
return (ta.startswith(tb) or tb.startswith(ta)
or ta.endswith(tb) or tb.endswith(ta))
def _sets_nest(A, B):
if not A or not B:
return True # phrase-only family: no coda evidence to refuse
return any(_tags_ok2(x, y) for x in A for y in B)
for ai in range(len(raw_groups)):
if not mkeys[ai]:
continue
@@ -935,8 +1022,13 @@ def analyze(draft: Draft):
# equal keys fuse; so do END-ALIGNED containments — a family
# rhyming on AA-x is the tail of one rhyming on AE-AA-x
# (back pocket / rap profit / office), the longer just
# carries lead syllables
# carries lead syllables. But when BOTH signatures are a
# single vowel, the final codas must nest too — forever (.)
# and sequential (L) share EH-x and still aren't one family
if va == vb or _tail_of(va, vb) or _tail_of(vb, va):
if (len(va) == 1 and len(vb) == 1
and not _sets_nest(_gtags(ai), _gtags(bi))):
continue
mparent[mfind(ai)] = mfind(bi)
mclusters = defaultdict(list)
+16
View File
@@ -670,3 +670,19 @@ def test_multis_generator():
def test_multis_in_lookup_response():
data = lookup("placement", mode="rhyme")
assert data["multis"] and "basement" not in data["multis"] # perfects excluded
def test_forever_does_not_glue_to_sequential():
# both carry EH-x, but forever ends open (.) and sequential closes
# on L — and «forever (forever» phrase carpets obey refrain muting
text = ("Diamonds are forever (forever, forever)\n"
"Diamonds are forever (forever, forever, forever)\n"
"Forever ever? Forever ever? Ever, ever?\n"
"this is a question of writing style\n"
"going the extra mile\n"
"quintessential\n"
"sequential")
seq = group_with(text, "quintessential", "sequential")
assert "forever" not in seq and "ever" not in seq
assert "question" not in seq and "extra" not in seq
group_with(text, "style", "mile")