mirror of
https://github.com/kennethreitz/rhymepad.org.git
synced 2026-06-11 17:08:33 +00:00
Weak V-x attach needs coda nesting too
garbage/javascript were joining dollar/scholar's AA-x group via the attach path, which lacked the coda-class guard that the bucket path already has. Now a bare V-x key attaches only when the token's coda nests with the group's — the same rule applies on both paths. syrup/burden had the same looseness (ER-schwa with a mismatched coda) and now correctly splits; the real stir-up/were-up perfect phrase matches stay.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -350,6 +350,15 @@ def _final_coda_tag(pl: list[str]) -> str:
|
||||
# Matches a key made of "m:", one run of capital letters, then " x" and
# nothing else. NOTE(review): presumably the "bare V-x" multi key (one vowel
# plus a weak syllable) — confirm against multi_keys().
WEAK_MK = re.compile(r"^m:[A-Z]+ x$")
|
||||
|
||||
|
||||
def _coda_nest(ta: str, tb: str) -> bool:
|
||||
if ta == tb:
|
||||
return True
|
||||
if ta == "." or tb == ".":
|
||||
return False
|
||||
return (ta.startswith(tb) or tb.startswith(ta)
|
||||
or ta.endswith(tb) or tb.endswith(ta))
|
||||
|
||||
|
||||
def multi_keys(word: str) -> tuple[str, ...]:
|
||||
"""Multisyllabic keys across all candidate pronunciations, anchored at
|
||||
the last stressed vowel AND the first primary stress — KET-a-mine can
|
||||
@@ -757,14 +766,28 @@ def analyze(draft: Draft):
|
||||
if not t["is_end"] and (w in STOPWORDS or w in refrain or len(w) < 2):
|
||||
continue
|
||||
keys = multi_keys(t["word"])
|
||||
t_tag = _final_coda_tag(phones_for(t["word"]).split()) \
|
||||
if phones_for(t["word"]) else "."
|
||||
joined = False
|
||||
for key in keys: # join an existing family if any anchor fits
|
||||
gi = group_by_multi.get((t["sid"], key))
|
||||
if gi is not None:
|
||||
raw_groups[gi]["toks"].append(t)
|
||||
t["slant"] = True
|
||||
grouped.add(id(t))
|
||||
break
|
||||
else:
|
||||
if gi is None:
|
||||
continue
|
||||
# a bare V-x key is too weak to attach on alone: the token's
|
||||
# coda class must nest with the group's (garbage JH / dollar
|
||||
# R don't, even though both are AA-x)
|
||||
if WEAK_MK.match(key):
|
||||
gtags = {_final_coda_tag(phones_for(m["word"]).split())
|
||||
for m in raw_groups[gi]["toks"]
|
||||
if " " not in m["word"] and phones_for(m["word"])}
|
||||
if gtags and not any(_coda_nest(t_tag, gt) for gt in gtags):
|
||||
continue
|
||||
raw_groups[gi]["toks"].append(t)
|
||||
t["slant"] = True
|
||||
grouped.add(id(t))
|
||||
joined = True
|
||||
break
|
||||
if not joined:
|
||||
for key in keys:
|
||||
by_multi[(t["sid"], key)].append(t)
|
||||
|
||||
|
||||
+18
-2
@@ -250,7 +250,8 @@ def test_layered_phrase_rides_second_group():
|
||||
"fill the cup up to the brim\n"
|
||||
"Maybe I need to stir up shit\n"
|
||||
"the burden is mine")
|
||||
syrup = group_with(text, "syrup", "burden", "stir up")
|
||||
syrup = group_with(text, "syrup", "stir up") # perfect ER-AH-P
|
||||
assert "burden" not in syrup # ER-schwa with a mismatched coda — too loose
|
||||
cups = group_with(text, "cup", "up")
|
||||
assert "stir up" not in cups and "syrup" not in cups
|
||||
|
||||
@@ -261,7 +262,7 @@ def test_weak_phrase_attaches_but_never_founds():
|
||||
"fill the cup up to the brim\n"
|
||||
"shake the world up if it were up to me\n"
|
||||
"the burden is mine")
|
||||
group_with(text, "syrup", "burden", "were up")
|
||||
group_with(text, "syrup", "were up") # perfect ER-AH-P phrase match
|
||||
# ...but two weak phrases alone can't create a group
|
||||
assert "were up" not in highlighted("it were up to him\nit were up to her")
|
||||
|
||||
@@ -698,3 +699,18 @@ def test_vowel_families_are_local():
|
||||
for tok in res["tokens"]:
|
||||
bg[tok["g"]].add(res["lines"][tok["l"]][tok["s"]:tok["e"]].lower())
|
||||
assert not any({"baby", "daddy"} <= s for s in bg.values())
|
||||
|
||||
|
||||
def test_garbage_javascript_dont_attach():
    """garbage/javascript must stay out of the dollar/scholar family.

    All four words are AA-x, but garbage ends JH and javascript ends PT;
    neither nests with dollar/scholar's open coda, so they don't join.
    """
    from collections import defaultdict

    text = ("Garbage collected greatness, a dollar-sign disease\n"
            "your JavaScript lost at sea, the notebook of the scholar")
    # dollar and scholar themselves must still be grouped together
    group_with(text, "dollar", "scholar")

    res = analyze(Draft(text=text))
    lines = res["lines"]
    # collect the lower-cased surface text of every token, per group id
    words_by_group = defaultdict(set)
    for tok in res["tokens"]:
        surface = lines[tok["l"]][tok["s"]:tok["e"]]
        words_by_group[tok["g"]].add(surface.lower())

    dollar_family = next(
        members for members in words_by_group.values() if "dollar" in members
    )
    assert dollar_family.isdisjoint({"garbage", "javascript"})
|
||||
|
||||
Reference in New Issue
Block a user