From b204c8d9b98479ce3bb638ef7ac34de8f74d47fe Mon Sep 17 00:00:00 2001
From: Kenneth Reitz <me@kennethreitz.org>
Date: Sun, 7 Jun 2026 13:23:29 -0400
Subject: [PATCH] Multis lookup and per-line syllable counts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Multis: dictionary search by vowel skeleton — single words and
  two-word combos matching the target's run from its stressed anchor
  (charisma -> little/women/business; vacation plans -> information
  back). Perfect rhymes excluded; phrases supported as targets; new
  "multis" row in the rhymes section. Indexes warm at boot.
- Counts toggle: syllables per line in dim digits at the editor's
  right edge, wrap-safe and live; carried into the PNG export.
- Suffix fusion: multi families whose vowel keys are end-aligned
  containments merge (back pocket / rap profit / off it / still in
  office = one Em chain), with trailing schwas stripped before
  comparison; the longest end token now owns the scheme slot.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 app.py               | 197 +++++++++++++++++++++++++++++++++++++------
 static/index.html    |  43 +++++++++-
 tests/test_rhymes.py |  28 ++++++
 3 files changed, 242 insertions(+), 26 deletions(-)

diff --git a/app.py b/app.py
index 901a03c..a075ccf 100644
--- a/app.py
+++ b/app.py
@@ -32,6 +32,7 @@ async def lifespan(app: FastAPI):
     except Exception:
         pass
     get_slant_index()
+    get_multi_indexes()
     yield
 
 
@@ -901,25 +902,63 @@ def analyze(draft: Draft):
     # (shoulder/older/colder) shouldn't split colors with the slant family
     # it lives inside (soldier/holster/coaster). Perfect members keep the
     # strong styling; the slant side keeps per-token slant marks.
-    by_family: dict[str, dict] = {}
-    fused: list[dict] = []
-    for g in raw_groups:
-        mk = founding_projections(g["key"]).get("multi")
-        tgt = by_family.get(mk) if mk else None
-        if tgt is None:
-            if mk:
-                by_family[mk] = g
-            fused.append(g)
+    mkeys = [founding_projections(g["key"]).get("multi") for g in raw_groups]
+    mparent = list(range(len(raw_groups)))
+
+    def mfind(i):
+        while mparent[i] != i:
+            mparent[i] = mparent[mparent[i]]
+            i = mparent[i]
+        return i
+
+    def _tail_of(short, long):
+        return len(short) <= len(long) and long[len(long) - len(short):] == short
+
+    def _sig(key):
+        vs = key[2:].split()
+        while vs and vs[-1] == "x":
+            vs.pop()  # trailing schwas fall off the beat on both sides
+        return vs
+
+    for ai in range(len(raw_groups)):
+        if not mkeys[ai]:
             continue
-        if g["slant"]:
-            for t in g["toks"]:
-                t["slant"] = True
-        elif tgt["slant"]:
-            for t in tgt["toks"]:
-                t["slant"] = True
-            tgt["slant"] = False
-            tgt["key"] = g["key"]
-        tgt["toks"].extend(g["toks"])
+        va = _sig(mkeys[ai])
+        if not va:
+            continue
+        for bi in range(ai + 1, len(raw_groups)):
+            if not mkeys[bi]:
+                continue
+            vb = _sig(mkeys[bi])
+            if not vb:
+                continue
+            # equal keys fuse; so do END-ALIGNED containments — a family
+            # rhyming on AA-x is the tail of one rhyming on AE-AA-x
+            # (back pocket / rap profit / office), the longer just
+            # carries lead syllables
+            if va == vb or _tail_of(va, vb) or _tail_of(vb, va):
+                mparent[mfind(ai)] = mfind(bi)
+
+    mclusters = defaultdict(list)
+    for gi in range(len(raw_groups)):
+        mclusters[mfind(gi)].append(gi)
+    fused: list[dict] = []
+    for members in mclusters.values():
+        if len(members) == 1:
+            fused.append(raw_groups[members[0]])
+            continue
+        hub = max(members, key=lambda gi: (not raw_groups[gi]["slant"],
+                                           len(raw_groups[gi]["toks"])))
+        core = raw_groups[hub]
+        for gi in members:
+            if gi == hub:
+                continue
+            g = raw_groups[gi]
+            if g["slant"]:
+                for t in g["toks"]:
+                    t["slant"] = True
+            core["toks"].extend(g["toks"])
+        fused.append(core)
     raw_groups = fused
 
     # fuse single-vowel perfect families whose coda classes NEST — the
@@ -1029,12 +1068,17 @@ def analyze(draft: Draft):
             t["gid"] = gid
         groups_out.append({"id": gid, "color": gid % COLORS, "slant": g["slant"]})
 
-    # stanza rhyme schemes from line-ending groups
-    # (a grouped end word wins over a grouped end phrase)
-    end_gid: dict[int, int] = {}
+    # stanza rhyme schemes from line-ending groups: the token covering
+    # the most of the line's tail owns the slot — Em rhymes "-cock it",
+    # not "it"
+    end_best: dict[int, tuple[int, int]] = {}
     for t in [*tokens, *phrases]:
         if t["is_end"] and t["gid"] is not None:
-            end_gid.setdefault(t["line"], t["gid"])
+            span = t["end"] - t["start"]
+            cur = end_best.get(t["line"])
+            if cur is None or span > cur[0]:
+                end_best[t["line"]] = (span, t["gid"])
+    end_gid = {ln: gid for ln, (sp, gid) in end_best.items()}
     last_tok = {}
     for t in tokens:
         if t["is_end"]:
@@ -1340,6 +1384,109 @@ def word_info(word: str):
             "zipf": round(zipf_frequency(w, "en"), 1)}
 
 
+_multi_left: dict[str, list[str]] | None = None
+_multi_right: dict[str, list[str]] | None = None
+
+
+def _squeeze_vs(vs: list[str]) -> str:
+    """Vowel skeleton: first vowel exact, later reduced vowels merge to x
+    — the same equivalence the multi detection passes use; [] gives ""."""
+    return " ".join([*vs[:1], *("x" if v in REDUCED else v for v in vs[1:])])
+
+
+def get_multi_indexes():
+    """Return (left, right): tail-skeleton -> words for left halves and
+    full-skeleton -> words for right halves / whole words; built once."""
+    global _multi_left, _multi_right
+    if _multi_left is not None:
+        return _multi_left, _multi_right
+    pronouncing.init_cmu()
+    by_tail: dict[str, list[str]] = defaultdict(list)
+    by_full: dict[str, list[str]] = defaultdict(list)
+    taken: set[str] = set()
+    for word, raw in pronouncing.pronunciations:
+        # only the first listed pronunciation of each accepted word is used
+        usable = word.isalpha() and len(word) >= 3 and word not in taken
+        if not usable or zipf_frequency(word, "en") < 3.0:
+            continue
+        taken.add(word)
+        phones = _norm_r(raw)
+        if tail := _tail_vowels(phones):
+            by_tail[_squeeze_vs(tail)].append(word)
+        full = _all_vowels(phones)  # right halves need a phone ending in 1/2
+        if full and any(p[-1] in "12" for p in phones.split()):
+            by_full[_squeeze_vs(full)].append(word)
+    _multi_left, _multi_right = by_tail, by_full
+    return by_tail, by_full
+
+
+def target_skeleton(w: str) -> list[str] | None:
+    """Vowel run from the first primary stress — where a rap multi anchors
+    (e-LE-va-tor reads from its EH). None if unknown or under 2 vowels."""
+    if " " in w:  # phrase: first word's tail + every vowel of the rest
+        parts = w.split()
+        pa = phones_for(parts[0])
+        rest = [phones_for(p) for p in parts[1:]]
+        if not pa or not all(rest):
+            return None
+        vs = list(_tail_vowels(pa) or ())  # copy: += below extends in place
+        for ph in rest:
+            vs += _all_vowels(ph)
+        return vs if len(vs) >= 2 else None
+    ph = phones_for(w)
+    if not ph:
+        return None
+    pl = ph.split()
+    vi = next((i for i, p in enumerate(pl) if p[-1] == "1"),
+              next((i for i, p in enumerate(pl) if p[-1].isdigit()), None))
+    if vi is None:
+        return None
+    vs = [DIGITS.sub("", p) for p in pl[vi:] if p[-1].isdigit()]
+    return vs if len(vs) >= 2 else None
+
+
+def multis_for(w: str, exclude: set, limit: int = 14) -> list[str]:
+    """Multisyllabic rhymes for w: single words and two-word combos whose
+    vowel skeleton matches the target's (elevator -> hella paper), best
+    first. Skips w, its parts and exclude; returns at most limit items."""
+    vs = target_skeleton(w)
+    if not vs:
+        return []
+    skel = _squeeze_vs(vs)
+    left, right = get_multi_indexes()
+    avoid = set(exclude) | set(w.split()) | {w}
+    scored: list[tuple[float, str]] = []
+    for cand in right.get(skel, []):
+        if cand not in avoid:
+            scored.append((zipf_frequency(cand, "en"), cand))
+    parts = skel.split()
+    for i in range(1, len(parts)):
+        lk, rk = " ".join(parts[:i]), " ".join(parts[i:])
+        if not any(v != "x" for v in parts[i:]):
+            continue  # the right half must carry a full vowel
+        lefts = sorted((w2 for w2 in left.get(lk, [])
+                        if w2 not in STOPWORDS and w2 not in avoid),
+                       key=lambda w2: -zipf_frequency(w2, "en"))[:8]
+        rights = sorted((w2 for w2 in right.get(rk, [])
+                         if w2 not in STOPWORDS and w2 not in avoid),
+                        key=lambda w2: -zipf_frequency(w2, "en"))[:8]
+        for a in lefts:
+            za = min(zipf_frequency(a, "en"), 5.0)
+            for b in rights:
+                if b != a:
+                    scored.append((za + min(zipf_frequency(b, "en"), 5.0) - 4.0,
+                                   f"{a} {b}"))
+    scored.sort(key=lambda t: (-t[0], -len(t[1]), t[1]))
+    out, seen = [], set()
+    for _, c in scored:
+        if len(out) >= limit:  # checked first so limit <= 0 yields []
+            break
+        if c not in seen:
+            seen.add(c)
+            out.append(c)
+    return out
+
+
 @app.get("/api/lookup")
 def lookup(word: str, mode: str = "rhyme", limit: int = 60):
     w = word.strip().lower()[:64]
@@ -1361,11 +1508,13 @@ def lookup(word: str, mode: str = "rhyme", limit: int = 60):
     if mode == "near":
         words = _ranked(near_cands, {w}, limit)
         return {"word": w, "mode": mode, "known": True, "words": words}
-    # rhyme mode carries both: perfect in "words", slant in "near"
+    # rhyme mode carries it all: perfect, slant, and multis
     words = _ranked(perfect, {w}, limit)
     near = _ranked(near_cands, {w}, limit // 2)
+    target = word.strip().lower()[:64] if rhyme_on else w
+    multis = multis_for(target, perfect)
     return {"word": w, "mode": mode, "known": True, "words": words,
-            "near": near, "rhyme_on": rhyme_on}
+            "near": near, "rhyme_on": rhyme_on, "multis": multis}
 
 
 app.mount("/", StaticFiles(directory=Path(__file__).parent / "static",
diff --git a/static/index.html b/static/index.html
index cdcd524..81a81ec 100644
--- a/static/index.html
+++ b/static/index.html
@@ -114,6 +114,11 @@
     z-index: 1;
   }
   #highlight .anno { color: #6a5f52; }
+  .sylcount {
+    position: absolute; right: 9px;
+    font-size: 10px; line-height: 1.6; color: #5d5347;
+    pointer-events: none;
+  }
   #editor::selection { background: rgba(232,129,74,0.22); color: transparent; }
   #stresslayer {
     pointer-events: none;
@@ -289,6 +294,7 @@ Double-click any word to look it up on the right."></textarea>
       <label class="mtoggle" title="Color-code rhyme families"><input type="checkbox" id="rhymeToggle" checked> rhyme</label>
       <label class="mtoggle" title="Underline words that share an initial sound"><input type="checkbox" id="allitToggle"> alliteration</label>
       <label class="mtoggle" title="Sheet music for your flow — syllable emphasis dots under each word"><input type="checkbox" id="stressToggle"> rhythm</label>
+      <label class="mtoggle" title="Syllables per line, at the right edge"><input type="checkbox" id="countsToggle"> counts</label>
       <div class="scheme-readout" id="schemeReadout"></div>
     </div>
   </div>
@@ -361,6 +367,9 @@ allitToggle.addEventListener('change', render);
 const rhymeToggle = document.getElementById('rhymeToggle');
 rhymeToggle.checked = true;
 rhymeToggle.addEventListener('change', render);
+const countsToggle = document.getElementById('countsToggle');
+countsToggle.checked = false;
+countsToggle.addEventListener('change', render);
 const schemeReadout = document.getElementById('schemeReadout');
 const COLORS = 12;
 
@@ -579,9 +588,11 @@ function render(){
         h += `<span class="hseg" style="${style}">${text}</span>`;
       }
     }
-    html += (/^\s*[#([]/.test(line) ? `<span class="anno">${h}</span>` : h) + '\n';
+    const lcls = 'lmark' + (/^\s*[#([]/.test(line) ? ' anno' : '');
+    html += (line ? `<span class="${lcls}" data-l="${i}">${h}</span>` : '') + '\n';
   });
   highlight.innerHTML = html;
+  renderCounts(lines);
   renderStress(lines);
   highlight.scrollTop = editor.scrollTop;
   highlight.scrollLeft = editor.scrollLeft;
@@ -608,6 +619,21 @@ function cadenceColors(){
   return map;
 }
 
+function renderCounts(lines){  // paint each line's syllable count as a .sylcount marker
+  highlight.querySelectorAll('.sylcount').forEach(el=>el.remove());  // drop markers from the last pass
+  if(!countsToggle.checked || !analysis || !analysis.meter) return;  // toggle off, or no meter data
+  analysis.meter.forEach(m=>{
+    if(analysis.lines[m.l] !== lines[m.l]) return;  // line being edited: analysis text no longer matches
+    const mark = highlight.querySelector(`.lmark[data-l="${m.l}"]`);
+    if(!mark) return;  // no .lmark span for this line (render emits none for empty lines)
+    const d = document.createElement('div');
+    d.className = 'sylcount';
+    d.textContent = m.syl;
+    d.style.top = (mark.offsetTop + 5) + 'px';  // follow the line's span, nudged 5px down
+    highlight.appendChild(d);
+  });
+}
+
 function renderStress(lines){
   if(!stressToggle.checked){ stresslayer.innerHTML = ''; return; }
   const byLine = {};
@@ -826,6 +852,8 @@ function paintSections(){
     });
     const near = e.rhyme.near || [];
     if(near.length) h += `<div class="res-label sub">near</div>` + chipHtml(near, 'near');
+    const multis = e.rhyme.multis || [];
+    if(multis.length) h += `<div class="res-label sub">multis</div>` + chipHtml(multis);
   }
   if(e.syn && e.syn.known && e.syn.sections.length){
     h += `<div class="res-label">synonyms</div>`;
@@ -857,7 +885,7 @@ document.getElementById('exportBtn').addEventListener('click', async ()=>{
 
   const probe = document.createElement('canvas').getContext('2d');
   probe.font = font;
-  const w = Math.ceil(Math.max(220, ...lines.map(l=>probe.measureText(l).width)) + PAD * 2);
+  const w = Math.ceil(Math.max(220, ...lines.map(l=>probe.measureText(l).width)) + PAD * 2 + (countsToggle.checked ? 28 : 0));
   const h = Math.ceil(lines.length * LH + PAD * 2 + 18);
   const canvas = document.createElement('canvas');
   canvas.width = w * S; canvas.height = h * S;
@@ -906,6 +934,17 @@ document.getElementById('exportBtn').addEventListener('click', async ()=>{
     }
     x.fillStyle = /^\s*[#([]/.test(line) ? '#6a5f52' : ink;
     x.fillText(line, PAD, y);
+    if(countsToggle.checked && fresh && analysis.meter){
+      const cm = analysis.meter.find(mm=>mm.l===i);
+      if(cm){
+        x.font = "10px 'Spline Sans Mono', monospace";
+        x.textAlign = 'right';
+        x.fillStyle = '#5d5347';
+        x.fillText(String(cm.syl), w - 10, y);
+        x.textAlign = 'left';
+        x.font = font;
+      }
+    }
     if(rhythm && fresh){
       const spans = (analysis.stress.filter(s=>s.l===i)).sort((a,b)=>a.s-b.s);
       x.font = "8px 'Spline Sans Mono', monospace";
diff --git a/tests/test_rhymes.py b/tests/test_rhymes.py
index 88208a8..98a7a8d 100644
--- a/tests/test_rhymes.py
+++ b/tests/test_rhymes.py
@@ -642,3 +642,31 @@ def test_end_dominated_vowel_families_fuse():
             "The top is not enough\n"
             "No choice, sellin' drugs")
     group_with(text, "love", "blood", "thugs", "mud", "enough", "drugs")
+
+
+def test_em_back_pocket_quintet_is_one_family():
+    # AE-AA mosaics, plain AA endings, and IH-x-AA triples all share the
+    # vowel tail — suffix fusion reads them as Em wrote them: one chain
+    text = ("got a laptop in my back pocket\n"
+            "My pen'll go off when I half-cock it\n"
+            "Got a fat knot from that rap profit\n"
+            "Made a livin' and a killin' off it\n"
+            "Ever since Bill Clinton was still in office")
+    group_with(text, "back pocket", "rap profit", "off it",
+               "still in office", "laptop", "pocket", "profit", "office")
+    assert scheme(text) == "aaaaa"  # all five line endings get the same letter
+
+
+# ------------------------------------------------------------------ multis
+
+def test_multis_generator():
+    from app import multis_for
+    ch = multis_for("charisma", set())  # empty exclude set
+    assert "little" in ch  # IH-x skeleton, not a perfect rhyme
+    el = multis_for("elevator", set())  # empty exclude set
+    assert any(" " in m for m in el)  # two-word combos exist
+
+
+def test_multis_in_lookup_response():
+    data = lookup("placement", mode="rhyme")  # rhyme mode's response carries "multis"
+    assert data["multis"] and "basement" not in data["multis"]  # perfects excluded