diff --git a/app.py b/app.py index 45608d9..554f1ed 100644 --- a/app.py +++ b/app.py @@ -350,6 +350,15 @@ def _final_coda_tag(pl: list[str]) -> str: WEAK_MK = re.compile(r"^m:[A-Z]+ x$") +def _coda_nest(ta: str, tb: str) -> bool: + if ta == tb: + return True + if ta == "." or tb == ".": + return False + return (ta.startswith(tb) or tb.startswith(ta) + or ta.endswith(tb) or tb.endswith(ta)) + + def multi_keys(word: str) -> tuple[str, ...]: """Multisyllabic keys across all candidate pronunciations, anchored at the last stressed vowel AND the first primary stress — KET-a-mine can @@ -757,14 +766,28 @@ def analyze(draft: Draft): if not t["is_end"] and (w in STOPWORDS or w in refrain or len(w) < 2): continue keys = multi_keys(t["word"]) + t_tag = _final_coda_tag(phones_for(t["word"]).split()) \ + if phones_for(t["word"]) else "." + joined = False for key in keys: # join an existing family if any anchor fits gi = group_by_multi.get((t["sid"], key)) - if gi is not None: - raw_groups[gi]["toks"].append(t) - t["slant"] = True - grouped.add(id(t)) - break - else: + if gi is None: + continue + # a bare V-x key is too weak to attach on alone: the token's + # coda class must nest with the group's (garbage JH / dollar + # R don't, even though both are AA-x) + if WEAK_MK.match(key): + gtags = {_final_coda_tag(phones_for(m["word"]).split()) + for m in raw_groups[gi]["toks"] + if " " not in m["word"] and phones_for(m["word"])} + if gtags and not any(_coda_nest(t_tag, gt) for gt in gtags): + continue + raw_groups[gi]["toks"].append(t) + t["slant"] = True + grouped.add(id(t)) + joined = True + break + if not joined: for key in keys: by_multi[(t["sid"], key)].append(t) diff --git a/tests/test_rhymes.py b/tests/test_rhymes.py index f6818bf..9f0b51d 100644 --- a/tests/test_rhymes.py +++ b/tests/test_rhymes.py @@ -250,7 +250,8 @@ def test_layered_phrase_rides_second_group(): "fill the cup up to the brim\n" "Maybe I need to stir up shit\n" "the burden is mine") - syrup = group_with(text, "syrup", "burden", "stir up") + syrup = group_with(text, "syrup", "stir up") # perfect ER-AH-P + assert "burden" not in syrup # ER-schwa with a mismatched coda — too loose cups = group_with(text, "cup", "up") assert "stir up" not in cups and "syrup" not in cups @@ -261,7 +262,7 @@ def test_weak_phrase_attaches_but_never_founds(): "fill the cup up to the brim\n" "shake the world up if it were up to me\n" "the burden is mine") - group_with(text, "syrup", "burden", "were up") + group_with(text, "syrup", "were up") # perfect ER-AH-P phrase match # ...but two weak phrases alone can't create a group assert "were up" not in highlighted("it were up to him\nit were up to her") @@ -698,3 +699,18 @@ def test_vowel_families_are_local(): for tok in res["tokens"]: bg[tok["g"]].add(res["lines"][tok["l"]][tok["s"]:tok["e"]].lower()) assert not any({"baby", "daddy"} <= s for s in bg.values()) + + +def test_garbage_javascript_dont_attach(): + # both AA-x, but garbage ends JH and javascript ends PT — neither + # nests with dollar/scholar's open coda, so they don't join + text = ("Garbage collected greatness, a dollar-sign disease\n" + "your JavaScript lost at sea, the notebook of the scholar") + group_with(text, "dollar", "scholar") + res = analyze(Draft(text=text)) + from collections import defaultdict + bg = defaultdict(set) + for t in res["tokens"]: + bg[t["g"]].add(res["lines"][t["l"]][t["s"]:t["e"]].lower()) + dfam = next(s for s in bg.values() if "dollar" in s) + assert "garbage" not in dfam and "javascript" not in dfam