Cot-caught merger + end-position assonance for phrases

AO merges into AA except before R (thought/lot, off/forgotten; car/core stay apart). Line-ending phrases may now rhyme on their pure vowel run, just as end words always could — "forgotten" / "off of" lands.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-11 17:08:33 +00:00 · 2026-06-07 03:51:19 -04:00
parent e49e5f2844
commit 35b68cf7b8
2 changed files with 47 additions and 7 deletions
@@ -71,17 +71,22 @@ STOPWORDS = frozenset(
 # --------------------------------------------------------------------------

 def _norm_r(phones: str) -> str:
-    """Neutralize vowel contrasts that English loses before R: CMU has
-    fear = F IH1 R but hear = HH IY1 R, yet they rhyme in every dialect
-    (the NEAR vowel). Same for UH/UW (cure, tour)."""
+    """Neutralize contrasts most American English doesn't keep: IH/IY and
+    UH/UW merge before R (fear/hear, cure/tour — the NEAR vowel), and
+    AO merges into AA everywhere else (the cot-caught merger: thought/
+    lot, off/forgotten). Before R the AA/AO split survives (car/core)."""
    pl = phones.split()
-    for i in range(len(pl) - 1):
-        if pl[i + 1][0] == "R" and pl[i][-1].isdigit():
-            base, stress = pl[i][:-1], pl[i][-1]
+    for i in range(len(pl)):
+        if not pl[i][-1].isdigit():
+            continue
+        base, stress = pl[i][:-1], pl[i][-1]
+        if i + 1 < len(pl) and pl[i + 1][0] == "R":
            if base == "IH":
                pl[i] = "IY" + stress
            elif base == "UH":
                pl[i] = "UW" + stress
+        elif base == "AO":
+            pl[i] = "AA" + stress
    return " ".join(pl)


@@ -656,10 +661,31 @@ def analyze(draft: Draft):
            else:
                for key in filter(None, keys):
                    by_slant[(t["sid"], key)].append(t)
+    # line-ending PHRASES get the same end-position privilege as words:
+    # pure vowel-run matching ("forgotten" / "off of" — AA-schwa)
+    end_spans = defaultdict(list)
+    for g in raw_groups:
+        for t in g["toks"]:
+            if " " not in t["word"]:
+                end_spans[t["line"]].append((t["start"], t["end"]))
+    for p in phrases:
+        if not p["is_end"] or id(p) in grouped or len(p["vowels"]) < 2:
+            continue
+        if any(s < p["end"] <= e for s, e in end_spans[p["line"]]):
+            continue  # the tail word already claimed this line's ending
+        key = "v:" + " ".join(p["vowels"])
+        gi = group_by_slant.get((p["sid"], key))
+        if gi is not None:
+            raw_groups[gi]["toks"].append(p)
+            p["slant"] = True
+            grouped.add(id(p))
+        else:
+            by_slant[(p["sid"], key)].append(p)
+
    for (sid, key), toks in sorted(by_slant.items(),
                                   key=lambda kv: (-len(kv[1]), kv[0][1])):
        toks = [t for t in toks if id(t) not in grouped]
-        if len(toks) >= 2 and len({t["word"].lower() for t in toks}) >= 2:
+        if len(toks) >= 2 and len({t["word"].split()[0] for t in toks}) >= 2:
            raw_groups.append({"toks": toks, "slant": True, "key": key})
            grouped.update(id(t) for t in toks)

@@ -445,3 +445,17 @@ def test_weak_ending_requires_matching_coda():
            "Weaving heaven among hellish screams")
    fam = group_with(text, "entirety", "divinity")
    assert "screams" not in fam
+
+
+def test_cot_caught_merger():
+    # thought (AO T) and lot (AA T) are identical in merged dialects
+    group_with("I thought about it a lot\nI gave it everything I got",
+               "thought", "lot", "got")
+
+
+def test_end_phrase_pure_vowel_rhyme():
+    # forgotten / "off of": AA-schwa at both line ends — assonance is
+    # allowed at the line boundary, like it is for single words
+    text = ("I realize shit and write it down to be forgotten\n"
+            "Building a database to reference my life off of")
+    group_with(text, "forgotten", "off of")