Weak V-x attach needs coda nesting too

garbage/javascript were joining dollar/scholar's AA-x group via the attach path, which lacked the coda-class guard the bucket path has. Now a bare V-x key attaches only when the token's coda nests with the group's — the same rule on both paths.

syrup/burden was the same looseness (ER-schwa, mismatched coda) and now correctly splits; the real stir-up/were-up perfect phrase matches stay.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-11 17:08:33 +00:00 · 2026-06-07 20:35:31 -04:00
parent e6b86a8a88
commit c77d24fb83
2 changed files with 47 additions and 8 deletions
@@ -350,6 +350,15 @@ def _final_coda_tag(pl: list[str]) -> str:
 WEAK_MK = re.compile(r"^m:[A-Z]+ x$")


+def _coda_nest(ta: str, tb: str) -> bool:
+    if ta == tb:
+        return True
+    if ta == "." or tb == ".":
+        return False
+    return (ta.startswith(tb) or tb.startswith(ta)
+            or ta.endswith(tb) or tb.endswith(ta))
+
+
 def multi_keys(word: str) -> tuple[str, ...]:
    """Multisyllabic keys across all candidate pronunciations, anchored at
    the last stressed vowel AND the first primary stress — KET-a-mine can
@@ -757,14 +766,28 @@ def analyze(draft: Draft):
        if not t["is_end"] and (w in STOPWORDS or w in refrain or len(w) < 2):
            continue
        keys = multi_keys(t["word"])
+        t_tag = _final_coda_tag(phones_for(t["word"]).split()) \
+            if phones_for(t["word"]) else "."
+        joined = False
        for key in keys:  # join an existing family if any anchor fits
            gi = group_by_multi.get((t["sid"], key))
-            if gi is not None:
-                raw_groups[gi]["toks"].append(t)
-                t["slant"] = True
-                grouped.add(id(t))
-                break
-        else:
+            if gi is None:
+                continue
+            # a bare V-x key is too weak to attach on alone: the token's
+            # coda class must nest with the group's (garbage JH / dollar
+            # R don't, even though both are AA-x)
+            if WEAK_MK.match(key):
+                gtags = {_final_coda_tag(phones_for(m["word"]).split())
+                         for m in raw_groups[gi]["toks"]
+                         if " " not in m["word"] and phones_for(m["word"])}
+                if gtags and not any(_coda_nest(t_tag, gt) for gt in gtags):
+                    continue
+            raw_groups[gi]["toks"].append(t)
+            t["slant"] = True
+            grouped.add(id(t))
+            joined = True
+            break
+        if not joined:
            for key in keys:
                by_multi[(t["sid"], key)].append(t)

@@ -250,7 +250,8 @@ def test_layered_phrase_rides_second_group():
            "fill the cup up to the brim\n"
            "Maybe I need to stir up shit\n"
            "the burden is mine")
-    syrup = group_with(text, "syrup", "burden", "stir up")
+    syrup = group_with(text, "syrup", "stir up")  # perfect ER-AH-P
+    assert "burden" not in syrup  # ER-schwa with a mismatched coda — too loose
    cups = group_with(text, "cup", "up")
    assert "stir up" not in cups and "syrup" not in cups

@@ -261,7 +262,7 @@ def test_weak_phrase_attaches_but_never_founds():
            "fill the cup up to the brim\n"
            "shake the world up if it were up to me\n"
            "the burden is mine")
-    group_with(text, "syrup", "burden", "were up")
+    group_with(text, "syrup", "were up")  # perfect ER-AH-P phrase match
    # ...but two weak phrases alone can't create a group
    assert "were up" not in highlighted("it were up to him\nit were up to her")

@@ -698,3 +699,18 @@ def test_vowel_families_are_local():
    for tok in res["tokens"]:
        bg[tok["g"]].add(res["lines"][tok["l"]][tok["s"]:tok["e"]].lower())
    assert not any({"baby", "daddy"} <= s for s in bg.values())
+
+
+def test_garbage_javascript_dont_attach():
+    # both AA-x, but garbage ends JH and javascript ends PT — neither
+    # nests with dollar/scholar's open coda, so they don't join
+    text = ("Garbage collected greatness, a dollar-sign disease\n"
+            "your JavaScript lost at sea, the notebook of the scholar")
+    group_with(text, "dollar", "scholar")
+    res = analyze(Draft(text=text))
+    from collections import defaultdict
+    bg = defaultdict(set)
+    for t in res["tokens"]:
+        bg[t["g"]].add(res["lines"][t["l"]][t["s"]:t["e"]].lower())
+    dfam = next(s for s in bg.values() if "dollar" in s)
+    assert "garbage" not in dfam and "javascript" not in dfam