mirror of
https://github.com/kennethreitz/rhymepad.org.git
synced 2026-06-11 17:08:33 +00:00
Weak V-x families stop gluing: coda-nested buckets, gated fusion
The spotlight exposed forever/question/extra/sequential as one EH-x family. Three fixes: - phrases obey refrain muting (the «Forever ever» carpet) - bare V-x buckets subdivide by nesting final-coda classes, so placement (NT) keeps creation (N) but forever (.) releases sequential (L); phrases ride the main cluster - single-vowel suffix fusion requires member coda sets to nest Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -337,6 +337,19 @@ def _m2_key(seq: list[str]) -> str | None:
|
||||
return f"m2:{vowels[0]} {_coda_class(ph[vi + 1])} x"
|
||||
|
||||
|
||||
def _final_coda_tag(pl: list[str]) -> str:
|
||||
"""Coda-class string after the LAST vowel ('.' for open)."""
|
||||
last = -1
|
||||
for i, p in enumerate(pl):
|
||||
if p[-1].isdigit():
|
||||
last = i
|
||||
coda = "".join(_coda_class(DIGITS.sub("", p)) for p in pl[last + 1:])
|
||||
return coda or "."
|
||||
|
||||
|
||||
WEAK_MK = re.compile(r"^m:[A-Z]+ x$")
|
||||
|
||||
|
||||
def multi_keys(word: str) -> tuple[str, ...]:
|
||||
"""Multisyllabic keys across all candidate pronunciations, anchored at
|
||||
the last stressed vowel AND the first primary stress — KET-a-mine can
|
||||
@@ -514,6 +527,11 @@ def analyze(draft: Draft):
|
||||
"sid": sids[i], "gid": None, "slant": False,
|
||||
})
|
||||
|
||||
# words the draft leans on as refrain/filler (4+ uses) stop lighting
|
||||
# up mid-line — their line-end uses still count
|
||||
counts = Counter(t["word"].lower() for t in tokens)
|
||||
refrain = {w for w, c in counts.items() if c >= 4}
|
||||
|
||||
# phrase tokens: adjacent word pairs, so multi-word rhymes can match
|
||||
# single words (orange / door hinge). Anchored at the first word's
|
||||
# stressed vowel; competes in the multisyllabic slant pass only.
|
||||
@@ -523,6 +541,8 @@ def analyze(draft: Draft):
|
||||
line_toks[t["line"]].append(t)
|
||||
for toks in line_toks.values():
|
||||
for a, b in zip(toks, toks[1:]):
|
||||
if a["word"].lower() in refrain:
|
||||
continue # «Forever ever» carpets obey refrain muting too
|
||||
pa, pb = phones_for(a["word"]), phones_for(b["word"])
|
||||
if not (pa and pb):
|
||||
continue
|
||||
@@ -544,7 +564,8 @@ def analyze(draft: Draft):
|
||||
# a phrase touching a stopword ("were up") may still match
|
||||
# perfectly, but never competes in the vowel-only passes
|
||||
"weak": (a["word"].lower() in STOPWORDS
|
||||
or b["word"].lower() in STOPWORDS),
|
||||
or b["word"].lower() in STOPWORDS
|
||||
or b["word"].lower() in refrain),
|
||||
})
|
||||
|
||||
# mosaic triples: three-word runs whose vowel run is the rhyme —
|
||||
@@ -554,6 +575,8 @@ def analyze(draft: Draft):
|
||||
for a, b, c in zip(toks, toks[1:], toks[2:]):
|
||||
if a["word"].lower() in STOPWORDS:
|
||||
continue
|
||||
if a["word"].lower() in refrain:
|
||||
continue
|
||||
pa, pb, pc = (phones_for(a["word"]), phones_for(b["word"]),
|
||||
phones_for(c["word"]))
|
||||
if not (pa and pb and pc):
|
||||
@@ -581,11 +604,6 @@ def analyze(draft: Draft):
|
||||
"weak": False,
|
||||
})
|
||||
|
||||
# words the draft leans on as refrain/filler (4+ uses) stop lighting
|
||||
# up mid-line — their line-end uses still count
|
||||
counts = Counter(t["word"].lower() for t in tokens)
|
||||
refrain = {w for w, c in counts.items() if c >= 4}
|
||||
|
||||
# pass 1: perfect rhymes (shared rime), anywhere in a line — this is
|
||||
# what catches internal rhymes. Phrases compete too, so "stir up"
|
||||
# perfect-rhymes "syrup" even while its "up" rhymes with "cup".
|
||||
@@ -809,21 +827,67 @@ def analyze(draft: Draft):
|
||||
# biggest buckets claim first (a token may sit in several via its
|
||||
# anchors); distinctness by anchor word, so the phrase "fire burns"
|
||||
# can't pose as a different word than the "fire" it starts with
|
||||
for (sid, key), toks in sorted(by_multi.items(),
|
||||
key=lambda kv: (-len(kv[1]), kv[0][1])):
|
||||
def _flush_multi(toks, key):
|
||||
toks = [t for t in toks if id(t) not in grouped]
|
||||
if len(toks) < 2 or len({t["word"].split()[0] for t in toks}) < 2:
|
||||
continue
|
||||
return
|
||||
# an all-phrase bucket whose members mirror the same two word
|
||||
# groups is pure redundancy (oh my / go rhyme over oh+go, my+rhyme)
|
||||
halves = {t.get("halves") for t in toks}
|
||||
if (len(halves) == 1
|
||||
and None not in halves
|
||||
and None not in next(iter(halves))):
|
||||
continue
|
||||
return
|
||||
raw_groups.append({"toks": toks, "slant": True, "key": key})
|
||||
grouped.update(id(t) for t in toks)
|
||||
|
||||
def _word_tag(t):
|
||||
ph = phones_for(t["word"])
|
||||
return _final_coda_tag(ph.split()) if ph else "."
|
||||
|
||||
def _tags_ok(ta, tb):
|
||||
if ta == tb:
|
||||
return True
|
||||
if ta == "." or tb == ".":
|
||||
return False
|
||||
return (ta.startswith(tb) or tb.startswith(ta)
|
||||
or ta.endswith(tb) or tb.endswith(ta))
|
||||
|
||||
for (sid, key), toks in sorted(by_multi.items(),
|
||||
key=lambda kv: (-len(kv[1]), kv[0][1])):
|
||||
if not WEAK_MK.match(key):
|
||||
_flush_multi(toks, key)
|
||||
continue
|
||||
# a bare V-x signature is too weak on its own: subdivide the
|
||||
# bucket by nesting final-coda classes, so placement (NT) keeps
|
||||
# creation (N) but forever (.) lets go of sequential (L)
|
||||
words = [t for t in toks if " " not in t["word"]]
|
||||
phs = [t for t in toks if " " in t["word"]]
|
||||
tags = [_word_tag(t) for t in words]
|
||||
par = list(range(len(words)))
|
||||
|
||||
def _f(i):
|
||||
while par[i] != i:
|
||||
par[i] = par[par[i]]
|
||||
i = par[i]
|
||||
return i
|
||||
|
||||
for i in range(len(words)):
|
||||
for j in range(i + 1, len(words)):
|
||||
if _tags_ok(tags[i], tags[j]):
|
||||
par[_f(i)] = _f(j)
|
||||
clus = defaultdict(list)
|
||||
for i, t in enumerate(words):
|
||||
clus[_f(i)].append(t)
|
||||
subsets = sorted(clus.values(), key=len, reverse=True)
|
||||
if phs:
|
||||
if subsets:
|
||||
subsets[0].extend(phs) # phrases ride the main cluster
|
||||
else:
|
||||
subsets = [phs]
|
||||
for sub in subsets:
|
||||
_flush_multi(sub, key)
|
||||
|
||||
# pass 4: consonance-aware slant anywhere in a line — last stressed
|
||||
# vowel + first coda consonant, so bliss / whisps / exist (IH S) group
|
||||
# even though their full codas differ
|
||||
@@ -915,11 +979,34 @@ def analyze(draft: Draft):
|
||||
return len(short) <= len(long) and long[len(long) - len(short):] == short
|
||||
|
||||
def _sig(key):
|
||||
vs = key[2:].split()
|
||||
vs = key[2:].split("|", 1)[0].split()
|
||||
while vs and vs[-1] == "x":
|
||||
vs.pop() # trailing schwas fall off the beat on both sides
|
||||
return vs
|
||||
|
||||
def _gtags(gi):
|
||||
out = set()
|
||||
for t in raw_groups[gi]["toks"]:
|
||||
if " " in t["word"]:
|
||||
continue
|
||||
ph = phones_for(t["word"])
|
||||
if ph:
|
||||
out.add(_final_coda_tag(ph.split()))
|
||||
return out
|
||||
|
||||
def _tags_ok2(ta, tb):
|
||||
if ta == tb:
|
||||
return True
|
||||
if ta == "." or tb == ".":
|
||||
return False
|
||||
return (ta.startswith(tb) or tb.startswith(ta)
|
||||
or ta.endswith(tb) or tb.endswith(ta))
|
||||
|
||||
def _sets_nest(A, B):
|
||||
if not A or not B:
|
||||
return True # phrase-only family: no coda evidence to refuse
|
||||
return any(_tags_ok2(x, y) for x in A for y in B)
|
||||
|
||||
for ai in range(len(raw_groups)):
|
||||
if not mkeys[ai]:
|
||||
continue
|
||||
@@ -935,8 +1022,13 @@ def analyze(draft: Draft):
|
||||
# equal keys fuse; so do END-ALIGNED containments — a family
|
||||
# rhyming on AA-x is the tail of one rhyming on AE-AA-x
|
||||
# (back pocket / rap profit / office), the longer just
|
||||
# carries lead syllables
|
||||
# carries lead syllables. But when BOTH signatures are a
|
||||
# single vowel, the final codas must nest too — forever (.)
|
||||
# and sequential (L) share EH-x and still aren't one family
|
||||
if va == vb or _tail_of(va, vb) or _tail_of(vb, va):
|
||||
if (len(va) == 1 and len(vb) == 1
|
||||
and not _sets_nest(_gtags(ai), _gtags(bi))):
|
||||
continue
|
||||
mparent[mfind(ai)] = mfind(bi)
|
||||
|
||||
mclusters = defaultdict(list)
|
||||
|
||||
@@ -670,3 +670,19 @@ def test_multis_generator():
|
||||
def test_multis_in_lookup_response():
|
||||
data = lookup("placement", mode="rhyme")
|
||||
assert data["multis"] and "basement" not in data["multis"] # perfects excluded
|
||||
|
||||
|
||||
def test_forever_does_not_glue_to_sequential():
|
||||
# both carry EH-x, but forever ends open (.) and sequential closes
|
||||
# on L — and «forever (forever» phrase carpets obey refrain muting
|
||||
text = ("Diamonds are forever (forever, forever)\n"
|
||||
"Diamonds are forever (forever, forever, forever)\n"
|
||||
"Forever ever? Forever ever? Ever, ever?\n"
|
||||
"this is a question of writing style\n"
|
||||
"going the extra mile\n"
|
||||
"quintessential\n"
|
||||
"sequential")
|
||||
seq = group_with(text, "quintessential", "sequential")
|
||||
assert "forever" not in seq and "ever" not in seq
|
||||
assert "question" not in seq and "extra" not in seq
|
||||
group_with(text, "style", "mile")
|
||||
|
||||
Reference in New Issue
Block a user