Weak V-x attach needs coda nesting too

garbage/javascript were joining dollar/scholar's AA-x group via the
attach path, which lacked the coda-class guard the bucket path has.
Now a bare V-x key attaches only when the token's coda nests with the
group's — the same rule on both paths. syrup/burden had the same looseness
(ER-schwa, mismatched coda) and now correctly splits; the real
stir-up/were-up perfect phrase matches stay.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-07 20:35:31 -04:00
parent e6b86a8a88
commit c77d24fb83
2 changed files with 47 additions and 8 deletions
+29 -6
View File
@@ -350,6 +350,15 @@ def _final_coda_tag(pl: list[str]) -> str:
# Matches a multi key of the exact form "m:<UPPERCASE LETTERS> x" — the bare
# "V-x" key shape that is too weak to attach a token to a group on its own.
WEAK_MK = re.compile(r"^m:[A-Z]+ x$")
def _coda_nest(ta: str, tb: str) -> bool:
if ta == tb:
return True
if ta == "." or tb == ".":
return False
return (ta.startswith(tb) or tb.startswith(ta)
or ta.endswith(tb) or tb.endswith(ta))
def multi_keys(word: str) -> tuple[str, ...]:
"""Multisyllabic keys across all candidate pronunciations, anchored at
the last stressed vowel AND the first primary stress — KET-a-mine can
@@ -757,14 +766,28 @@ def analyze(draft: Draft):
if not t["is_end"] and (w in STOPWORDS or w in refrain or len(w) < 2):
continue
keys = multi_keys(t["word"])
t_tag = _final_coda_tag(phones_for(t["word"]).split()) \
if phones_for(t["word"]) else "."
joined = False
for key in keys: # join an existing family if any anchor fits
gi = group_by_multi.get((t["sid"], key))
if gi is not None:
raw_groups[gi]["toks"].append(t)
t["slant"] = True
grouped.add(id(t))
break
else:
if gi is None:
continue
# a bare V-x key is too weak to attach on alone: the token's
# coda class must nest with the group's (garbage JH / dollar
# R don't, even though both are AA-x)
if WEAK_MK.match(key):
gtags = {_final_coda_tag(phones_for(m["word"]).split())
for m in raw_groups[gi]["toks"]
if " " not in m["word"] and phones_for(m["word"])}
if gtags and not any(_coda_nest(t_tag, gt) for gt in gtags):
continue
raw_groups[gi]["toks"].append(t)
t["slant"] = True
grouped.add(id(t))
joined = True
break
if not joined:
for key in keys:
by_multi[(t["sid"], key)].append(t)
+18 -2
View File
@@ -250,7 +250,8 @@ def test_layered_phrase_rides_second_group():
"fill the cup up to the brim\n"
"Maybe I need to stir up shit\n"
"the burden is mine")
syrup = group_with(text, "syrup", "burden", "stir up")
syrup = group_with(text, "syrup", "stir up") # perfect ER-AH-P
assert "burden" not in syrup # ER-schwa with a mismatched coda — too loose
cups = group_with(text, "cup", "up")
assert "stir up" not in cups and "syrup" not in cups
@@ -261,7 +262,7 @@ def test_weak_phrase_attaches_but_never_founds():
"fill the cup up to the brim\n"
"shake the world up if it were up to me\n"
"the burden is mine")
group_with(text, "syrup", "burden", "were up")
group_with(text, "syrup", "were up") # perfect ER-AH-P phrase match
# ...but two weak phrases alone can't create a group
assert "were up" not in highlighted("it were up to him\nit were up to her")
@@ -698,3 +699,18 @@ def test_vowel_families_are_local():
for tok in res["tokens"]:
bg[tok["g"]].add(res["lines"][tok["l"]][tok["s"]:tok["e"]].lower())
assert not any({"baby", "daddy"} <= s for s in bg.values())
def test_garbage_javascript_dont_attach():
    """Weak AA-x tokens must not attach to the dollar/scholar family."""
    # both AA-x, but garbage ends JH and javascript ends PT — neither
    # nests with dollar/scholar's open coda, so they don't join
    text = ("Garbage collected greatness, a dollar-sign disease\n"
            "your JavaScript lost at sea, the notebook of the scholar")
    # dollar and scholar themselves are still expected to share a group
    # (presumably asserted inside group_with — verify against the helper)
    group_with(text, "dollar", "scholar")
    res = analyze(Draft(text=text))
    from collections import defaultdict
    # group id -> lowercased surface text of every token in that group;
    # each token's text is sliced out of its line by start/end offsets
    bg = defaultdict(set)
    for t in res["tokens"]:
        bg[t["g"]].add(res["lines"][t["l"]][t["s"]:t["e"]].lower())
    dfam = next(s for s in bg.values() if "dollar" in s)
    assert "garbage" not in dfam and "javascript" not in dfam