From 1f888e2b21aad75c718c4ef0ce769b83bbe2b939 Mon Sep 17 00:00:00 2001
From: Kenneth Reitz <me@kennethreitz.org>
Date: Fri, 27 Mar 2026 12:10:54 -0400
Subject: [PATCH 1/3] Vocal/formant synth with choir preset
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Formant synthesis: glottal buzz source through parallel bandpass
filters at vowel resonance frequencies. Supports 5 vowels (A E I O U)
with consonant onsets (plosives, sibilants, nasals, fricatives,
liquids, aspirates, glides). Per-note lyrics via Part.add(lyric=).

Best for choir pads — vowel sounds with cathedral reverb and detune.
Consonant synthesis is rudimentary (noise bursts, not real speech).

Presets: vocal (solo), choir (detuned ensemble).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 pytheory/play.py   | 166 ++++++++++++++++++++++++++++++++++++++++++++-
 pytheory/rhythm.py |  15 +++-
 2 files changed, 176 insertions(+), 5 deletions(-)

diff --git a/pytheory/play.py b/pytheory/play.py
index 87876b7..0a53686 100644
--- a/pytheory/play.py
+++ b/pytheory/play.py
@@ -909,6 +909,160 @@ def saxophone_wave(hz, peak=SAMPLE_PEAK, n_samples=SAMPLE_RATE):
     return (peak * wave).astype(numpy.int16)
 
 
+def vocal_wave(hz, peak=SAMPLE_PEAK, n_samples=SAMPLE_RATE, lyric="ah"):
+    """Vocal/formant synthesis — sings vowel sounds at a given pitch.
+
+    Models the human voice as:
+    1. Glottal buzz — sawtooth-like pulse train (vocal cords vibrating)
+    2. Formant filters — resonant peaks that shape the spectrum into
+       vowel sounds. Each vowel has 3-5 characteristic frequencies.
+    3. Breathiness — a small amount of noise mixed in
+
+    The ``lyric`` parameter controls which vowel formants are used.
+    Consonants are approximated with noise bursts and filter sweeps.
+
+    Vowel formant frequencies (Hz) for a male voice:
+        A (father): F1=800, F2=1200, F3=2500
+        E (bed):    F1=600, F2=1800, F3=2500
+        I (see):    F1=300, F2=2200, F3=3000
+        O (go):     F1=500, F2=1000, F3=2500
+        U (blue):   F1=350, F2=700,  F3=2500
+    """
+    import scipy.signal as _sig
+
+    # Vowel formant table: (F1, F2, F3, bandwidth1, bandwidth2, bandwidth3)
+    # Wide bandwidths for audible character
+    FORMANTS = {
+        'a': (800, 1200, 2500, 200, 200, 250),
+        'e': (600, 1800, 2500, 150, 200, 250),
+        'i': (300, 2200, 3000, 120, 200, 250),
+        'o': (500, 1000, 2500, 150, 180, 250),
+        'u': (350, 700, 2500, 100, 150, 200),
+    }
+
+    rng = numpy.random.default_rng(int(hz * 100 + len(lyric) * 7) % 2**31)
+
+    # Parse the lyric into a vowel sequence
+    vowels_in_lyric = [c.lower() for c in lyric if c.lower() in FORMANTS]
+    if not vowels_in_lyric:
+        vowels_in_lyric = ['a']  # default
+
+    # Glottal source — sawtooth-like pulse (vocal cord vibration)
+    # Real glottal pulse has sharper closing phase than opening
+    t = numpy.arange(n_samples, dtype=numpy.float64) / SAMPLE_RATE
+    # Slight vibrato (natural vocal wobble)
+    vib = hz * 0.0008 * numpy.sin(2 * numpy.pi * 5.5 * t)
+    phase = numpy.cumsum(2 * numpy.pi * (hz + vib) / SAMPLE_RATE)
+    # Glottal pulse: modified sawtooth with sharper falling edge
+    glottal = numpy.sin(phase) * 0.5 + numpy.sin(phase * 2) * 0.3 + numpy.sin(phase * 3) * 0.15
+
+    # Breathiness
+    breath = rng.normal(0, 0.05, n_samples)
+    source = glottal + breath
+
+    # Apply formant filters — one per vowel in the lyric
+    # If multiple vowels, crossfade between them over the note duration
+    n_vowels = len(vowels_in_lyric)
+    samples_per_vowel = n_samples // max(1, n_vowels)
+
+    out = numpy.zeros(n_samples, dtype=numpy.float64)
+
+    for vi, vowel in enumerate(vowels_in_lyric):
+        f1, f2, f3, bw1, bw2, bw3 = FORMANTS[vowel]
+        start = vi * samples_per_vowel
+        end = min(start + samples_per_vowel, n_samples)
+        if vi == n_vowels - 1:
+            end = n_samples  # last vowel gets remaining samples
+
+        segment = source[start:end].copy()
+
+        # Three formant bandpass filters — parallel, then summed
+        # Each formant is an independent resonant peak
+        formant_out = numpy.zeros_like(segment)
+        for fc, bw, gain in [(f1, bw1, 1.0), (f2, bw2, 0.8), (f3, bw3, 0.5)]:
+            lo = max(20, fc - bw)
+            hi = min(SAMPLE_RATE // 2 - 1, fc + bw)
+            if lo < hi:
+                bp, ap = _sig.butter(3, [lo, hi], btype='band', fs=SAMPLE_RATE)
+                formant_out += _sig.lfilter(bp, ap, segment).astype(numpy.float64) * gain
+        # Almost entirely formant-shaped — very little raw source
+        segment = formant_out * 0.9 + segment * 0.1
+
+        # Crossfade at vowel boundaries (10ms)
+        fade_len = min(int(SAMPLE_RATE * 0.01), len(segment) // 4)
+        if vi > 0 and fade_len > 0:
+            fade_in = numpy.linspace(0, 1, fade_len)
+            segment[:fade_len] *= fade_in
+        if vi < n_vowels - 1 and fade_len > 0:
+            fade_out = numpy.linspace(1, 0, fade_len)
+            segment[-fade_len:] *= fade_out
+
+        out[start:end] += segment[:end - start]
+
+    # Check for consonant-like onsets
+    lyric_lower = lyric.lower()
+    has_consonant = lyric_lower and lyric_lower[0] not in 'aeiou'
+
+    if has_consonant:
+        c = lyric_lower[0]
+        cons_len = min(int(SAMPLE_RATE * 0.03), n_samples)
+        if c in 'tdkpb':
+            # Plosive — brief noise burst
+            plosive = rng.uniform(-0.4, 0.4, cons_len)
+            plosive *= numpy.exp(-numpy.linspace(0, 15, cons_len))
+            out[:cons_len] = plosive + out[:cons_len] * 0.3
+        elif c in 'sz':
+            # Sibilant — filtered noise
+            sib = rng.uniform(-0.3, 0.3, cons_len)
+            if cons_len > 20:
+                bl, al = _sig.butter(2, [3000, min(8000, SAMPLE_RATE // 2 - 1)],
+                                     btype='band', fs=SAMPLE_RATE)
+                sib = _sig.lfilter(bl, al, numpy.pad(sib, (0, max(0, n_samples - cons_len))))[:cons_len]
+            sib *= numpy.exp(-numpy.linspace(0, 8, cons_len))
+            out[:cons_len] = sib * 0.5 + out[:cons_len] * 0.5
+        elif c in 'mn':
+            # Nasal — low formant
+            nasal_len = min(int(SAMPLE_RATE * 0.05), n_samples)
+            nasal = numpy.sin(2 * numpy.pi * 250 * t[:nasal_len]) * 0.3
+            nasal *= numpy.exp(-numpy.linspace(0, 5, nasal_len))
+            out[:nasal_len] = nasal + out[:nasal_len] * 0.5
+        elif c in 'fv':
+            # Fricative
+            fric = rng.uniform(-0.2, 0.2, cons_len)
+            fric *= numpy.exp(-numpy.linspace(0, 10, cons_len))
+            out[:cons_len] = fric * 0.4 + out[:cons_len] * 0.6
+        elif c in 'lr':
+            # Liquid — brief glide
+            glide_len = min(int(SAMPLE_RATE * 0.04), n_samples)
+            glide_t = numpy.arange(glide_len, dtype=numpy.float64) / SAMPLE_RATE
+            glide_hz = hz * 0.8 + hz * 0.2 * numpy.linspace(0, 1, glide_len)
+            glide = numpy.sin(numpy.cumsum(2 * numpy.pi * glide_hz / SAMPLE_RATE)) * 0.3
+            out[:glide_len] = glide + out[:glide_len] * 0.7
+        elif c == 'h':
+            # Aspirate — breathy onset
+            h_len = min(int(SAMPLE_RATE * 0.04), n_samples)
+            aspirate = rng.uniform(-0.3, 0.3, h_len)
+            aspirate *= numpy.exp(-numpy.linspace(0, 6, h_len))
+            out[:h_len] = aspirate * 0.5 + out[:h_len] * 0.5
+        elif c == 'w':
+            # Glide from U formant
+            w_len = min(int(SAMPLE_RATE * 0.05), n_samples)
+            w_t = numpy.arange(w_len, dtype=numpy.float64) / SAMPLE_RATE
+            w_source = numpy.sin(numpy.cumsum(2 * numpy.pi * hz / SAMPLE_RATE * numpy.ones(w_len)))
+            if w_len > 20:
+                bp, ap = _sig.butter(2, [max(20, 300), min(800, SAMPLE_RATE // 2 - 1)],
+                                     btype='band', fs=SAMPLE_RATE)
+                w_source = _sig.lfilter(bp, ap, w_source)
+            w_source *= numpy.linspace(0.5, 0, w_len) * 0.4
+            out[:w_len] = w_source + out[:w_len] * 0.6
+
+    mx = numpy.abs(out).max()
+    if mx > 0:
+        out /= mx
+
+    return (peak * out).astype(numpy.int16)
+
+
 def granular_wave(hz, peak=SAMPLE_PEAK, n_samples=SAMPLE_RATE,
                   grain_size=0.04, density=50, scatter=0.5,
                   pitch_var=12, source="saw"):
@@ -1290,6 +1444,7 @@ class Synth(Enum):
     TIMPANI = "timpani_synth"
     SAXOPHONE = "saxophone_synth"
     GRANULAR = "granular_synth"
+    VOCAL = "vocal_synth"
     ACOUSTIC_GUITAR = "acoustic_guitar_synth"
     SITAR = "sitar_synth"
     ELECTRIC_GUITAR = "electric_guitar_synth"
@@ -1312,7 +1467,7 @@ _SYNTH_FUNCTIONS = {
     "harpsichord_synth": harpsichord_wave, "cello_synth": cello_wave,
     "harp_synth": harp_wave, "upright_bass_synth": upright_bass_wave,
     "timpani_synth": timpani_wave, "saxophone_synth": saxophone_wave,
-    "granular_synth": granular_wave,
+    "granular_synth": granular_wave, "vocal_synth": vocal_wave,
     "acoustic_guitar_synth": acoustic_guitar_wave,
     "sitar_synth": sitar_wave, "electric_guitar_synth": electric_guitar_wave,
 }
@@ -3528,8 +3683,13 @@ def _render_notes_to_buf(notes, buf, samples_per_beat, total_samples,
                         bent = src_f[idx] * (1 - frac) + src_f[numpy.minimum(idx + 1, src_len - 1)] * frac
                         waves.append((bent * SAMPLE_PEAK).astype(numpy.int16))
                 else:
-                    # Render oscillators (pass synth_kwargs for FM etc.)
-                    waves = [synth_fn(hz, n_samples=n_samples, **_skw)
+                    # Per-note kwargs (e.g. lyric for vocal synth)
+                    note_skw = dict(_skw)
+                    note_lyric = getattr(note, 'lyric', '')
+                    if note_lyric:
+                        note_skw['lyric'] = note_lyric
+                    # Render oscillators
+                    waves = [synth_fn(hz, n_samples=n_samples, **note_skw)
                              for hz in pitches]
                 # Sub-oscillator: octave-below sine
                 if sub_osc > 0:
diff --git a/pytheory/rhythm.py b/pytheory/rhythm.py
index 6db5850..84811b4 100644
--- a/pytheory/rhythm.py
+++ b/pytheory/rhythm.py
@@ -246,6 +246,16 @@ INSTRUMENTS = {
         "reverb": 0.4, "reverb_type": "cathedral",
         "analog": 0.3,
     },
+    "vocal": {
+        "synth": "vocal_synth", "envelope": "strings",
+        "reverb": 0.3, "reverb_type": "hall",
+        "humanize": 0.15,
+    },
+    "choir": {
+        "synth": "vocal_synth", "envelope": "pad",
+        "detune": 8, "spread": 0.4,
+        "reverb": 0.45, "reverb_type": "cathedral",
+    },
     "granular_texture": {
         "synth": "granular_synth", "envelope": "none",
         "reverb": 0.5, "reverb_type": "taj_mahal",
@@ -367,6 +377,7 @@ class Note:
     velocity: int = 100
     bend: float = 0.0
     bend_type: str = "smooth"  # "smooth" (log), "linear", "late"
+    lyric: str = ""  # syllable for vocal synth
 
     @property
     def beats(self) -> float:
@@ -2095,7 +2106,7 @@ class Part:
         self._automation: list[tuple[float, dict]] = []  # (beat, {param: value})
 
     def add(self, tone_or_string, duration=Duration.QUARTER, *, velocity: int = 100,
-            bend: float = 0.0, bend_type: str = "smooth") -> "Part":
+            bend: float = 0.0, bend_type: str = "smooth", lyric: str = "") -> "Part":
         """Add a note. Accepts Tone/Chord objects or note strings like ``"E5"``.
 
         Duration can be a ``Duration`` enum or a raw float (beats).
@@ -2113,7 +2124,7 @@ class Part:
             duration = _RawDuration(duration)
         self.notes.append(Note(tone=tone_or_string, duration=duration,
                                velocity=velocity, bend=bend,
-                               bend_type=bend_type))
+                               bend_type=bend_type, lyric=lyric))
         return self
 
     def set(self, **params) -> "Part":

From 6a836dd891b10926cb46e28b3da76557c8561371 Mon Sep 17 00:00:00 2001
From: Kenneth Reitz <me@kennethreitz.org>
Date: Fri, 27 Mar 2026 12:13:50 -0400
Subject: [PATCH 2/3] Overhaul vocal synth: LF glottal model, 5 formants,
 jitter/shimmer

- LF-style glottal pulse approximation: asymmetric open/close phases
  (replaces the stacked-sine source)
- 5 parallel formant filters per vowel (based on Peterson & Barney
  1952 measurements, male voice)
- Jitter (0.3% pitch irregularity) + shimmer (2% amplitude)
- Much more voice-like than previous version
- Consonant onsets preserved

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 pytheory/play.py | 211 ++++++++++++++++++++++-------------------------
 1 file changed, 97 insertions(+), 114 deletions(-)

diff --git a/pytheory/play.py b/pytheory/play.py
index 0a53686..c2c1a57 100644
--- a/pytheory/play.py
+++ b/pytheory/play.py
@@ -912,149 +912,132 @@ def saxophone_wave(hz, peak=SAMPLE_PEAK, n_samples=SAMPLE_RATE):
 def vocal_wave(hz, peak=SAMPLE_PEAK, n_samples=SAMPLE_RATE, lyric="ah"):
     """Vocal/formant synthesis — sings vowel sounds at a given pitch.
 
-    Models the human voice as:
-    1. Glottal buzz — sawtooth-like pulse train (vocal cords vibrating)
-    2. Formant filters — resonant peaks that shape the spectrum into
-       vowel sounds. Each vowel has 3-5 characteristic frequencies.
-    3. Breathiness — a small amount of noise mixed in
-
-    The ``lyric`` parameter controls which vowel formants are used.
-    Consonants are approximated with noise bursts and filter sweeps.
-
-    Vowel formant frequencies (Hz) for a male voice:
-        A (father): F1=800, F2=1200, F3=2500
-        E (bed):    F1=600, F2=1800, F3=2500
-        I (see):    F1=300, F2=2200, F3=3000
-        O (go):     F1=500, F2=1000, F3=2500
-        U (blue):   F1=350, F2=700,  F3=2500
+    Models the human voice with:
+    1. LF glottal model — asymmetric pulse with sharp closure (not just sines)
+    2. 5 parallel resonant formant filters (real voice has 5 formant peaks)
+    3. Jitter + shimmer (natural pitch/amplitude irregularity)
+    4. Aspiration noise mixed with the glottal source
+    5. Consonant onsets (plosives, sibilants, nasals, etc.)
     """
     import scipy.signal as _sig
 
-    # Vowel formant table: (F1, F2, F3, bandwidth1, bandwidth2, bandwidth3)
-    # Wide bandwidths for audible character
+    # 5-formant table: (F1, F2, F3, F4, F5) frequencies and bandwidths
+    # Based on Peterson & Barney (1952) measurements, male voice
     FORMANTS = {
-        'a': (800, 1200, 2500, 200, 200, 250),
-        'e': (600, 1800, 2500, 150, 200, 250),
-        'i': (300, 2200, 3000, 120, 200, 250),
-        'o': (500, 1000, 2500, 150, 180, 250),
-        'u': (350, 700, 2500, 100, 150, 200),
+        'a': [(800, 130), (1200, 100), (2500, 140), (3300, 250), (3750, 300)],
+        'e': [(530, 80),  (1850, 100), (2500, 130), (3300, 250), (3750, 300)],
+        'i': [(280, 60),  (2250, 100), (2900, 120), (3350, 250), (3750, 300)],
+        'o': [(500, 100), (1000, 80),  (2500, 140), (3300, 250), (3750, 300)],
+        'u': ((325, 70),  (700, 60),   (2530, 140), (3300, 250), (3750, 300)),
     }
+    # Formant gains (relative amplitude per formant)
+    FGAINS = [1.0, 0.8, 0.5, 0.25, 0.15]
 
     rng = numpy.random.default_rng(int(hz * 100 + len(lyric) * 7) % 2**31)
+    t = numpy.arange(n_samples, dtype=numpy.float64) / SAMPLE_RATE
 
-    # Parse the lyric into a vowel sequence
+    # Parse vowels from lyric
     vowels_in_lyric = [c.lower() for c in lyric if c.lower() in FORMANTS]
     if not vowels_in_lyric:
-        vowels_in_lyric = ['a']  # default
+        vowels_in_lyric = ['a']
 
-    # Glottal source — sawtooth-like pulse (vocal cord vibration)
-    # Real glottal pulse has sharper closing phase than opening
-    t = numpy.arange(n_samples, dtype=numpy.float64) / SAMPLE_RATE
-    # Slight vibrato (natural vocal wobble)
-    vib = hz * 0.0008 * numpy.sin(2 * numpy.pi * 5.5 * t)
-    phase = numpy.cumsum(2 * numpy.pi * (hz + vib) / SAMPLE_RATE)
-    # Glottal pulse: modified sawtooth with sharper falling edge
-    glottal = numpy.sin(phase) * 0.5 + numpy.sin(phase * 2) * 0.3 + numpy.sin(phase * 3) * 0.15
+    # ── Glottal source: LF model approximation ──
+    # Asymmetric pulse: slow open phase, sharp closure, then closed phase.
+    # Much more "voice-like" than a sine or sawtooth.
+    # Jitter (pitch irregularity) + shimmer (amplitude irregularity)
+    jitter = rng.normal(0, hz * 0.003, n_samples)  # ~0.3% pitch jitter
+    shimmer = 1.0 + rng.normal(0, 0.02, n_samples)  # ~2% amp shimmer
+    # Vibrato
+    vib = hz * 0.001 * numpy.sin(2 * numpy.pi * 5.5 * t)
+    inst_freq = hz + vib + jitter
+    phase = numpy.cumsum(2 * numpy.pi * inst_freq / SAMPLE_RATE)
+    # LF glottal shape: sharper falling edge via phase shaping
+    saw = (phase / (2 * numpy.pi)) % 1.0  # 0 to 1 sawtooth
+    # Asymmetric: slow rise (60%), fast fall (40%)
+    glottal = numpy.where(saw < 0.6,
+                          numpy.sin(numpy.pi * saw / 0.6),   # smooth rise
+                          -numpy.sin(numpy.pi * (saw - 0.6) / 0.4) * 0.8)  # sharp fall
+    glottal *= shimmer
 
-    # Breathiness
-    breath = rng.normal(0, 0.05, n_samples)
-    source = glottal + breath
+    # Aspiration noise (breathiness)
+    breath = rng.normal(0, 0.08, n_samples)
+    source = glottal * 0.85 + breath * 0.15
 
-    # Apply formant filters — one per vowel in the lyric
-    # If multiple vowels, crossfade between them over the note duration
+    # ── Formant filtering ──
     n_vowels = len(vowels_in_lyric)
-    samples_per_vowel = n_samples // max(1, n_vowels)
-
     out = numpy.zeros(n_samples, dtype=numpy.float64)
 
-    for vi, vowel in enumerate(vowels_in_lyric):
-        f1, f2, f3, bw1, bw2, bw3 = FORMANTS[vowel]
-        start = vi * samples_per_vowel
-        end = min(start + samples_per_vowel, n_samples)
-        if vi == n_vowels - 1:
-            end = n_samples  # last vowel gets remaining samples
-
-        segment = source[start:end].copy()
-
-        # Three formant bandpass filters — parallel, then summed
-        # Each formant is an independent resonant peak
-        formant_out = numpy.zeros_like(segment)
-        for fc, bw, gain in [(f1, bw1, 1.0), (f2, bw2, 0.8), (f3, bw3, 0.5)]:
+    if n_vowels == 1:
+        # Single vowel — filter the whole thing
+        formants = FORMANTS[vowels_in_lyric[0]]
+        for (fc, bw), gain in zip(formants, FGAINS):
             lo = max(20, fc - bw)
             hi = min(SAMPLE_RATE // 2 - 1, fc + bw)
             if lo < hi:
-                bp, ap = _sig.butter(3, [lo, hi], btype='band', fs=SAMPLE_RATE)
-                formant_out += _sig.lfilter(bp, ap, segment).astype(numpy.float64) * gain
-        # Almost entirely formant-shaped — very little raw source
-        segment = formant_out * 0.9 + segment * 0.1
+                bp, ap = _sig.butter(2, [lo, hi], btype='band', fs=SAMPLE_RATE)
+                out += _sig.lfilter(bp, ap, source).astype(numpy.float64) * gain
+    else:
+        # Multiple vowels — crossfade formants
+        samples_per_vowel = n_samples // n_vowels
+        for vi, vowel in enumerate(vowels_in_lyric):
+            formants = FORMANTS[vowel]
+            start = vi * samples_per_vowel
+            end = n_samples if vi == n_vowels - 1 else start + samples_per_vowel
+            seg = source[start:end].copy()
+            seg_out = numpy.zeros_like(seg)
+            for (fc, bw), gain in zip(formants, FGAINS):
+                lo = max(20, fc - bw)
+                hi = min(SAMPLE_RATE // 2 - 1, fc + bw)
+                if lo < hi:
+                    bp, ap = _sig.butter(2, [lo, hi], btype='band', fs=SAMPLE_RATE)
+                    seg_out += _sig.lfilter(bp, ap, seg).astype(numpy.float64) * gain
+            # Crossfade
+            fade = min(int(SAMPLE_RATE * 0.02), len(seg_out) // 4)
+            if vi > 0 and fade > 0:
+                seg_out[:fade] *= numpy.linspace(0, 1, fade)
+            if vi < n_vowels - 1 and fade > 0:
+                seg_out[-fade:] *= numpy.linspace(1, 0, fade)
+            out[start:end] += seg_out[:end - start]
 
-        # Crossfade at vowel boundaries (10ms)
-        fade_len = min(int(SAMPLE_RATE * 0.01), len(segment) // 4)
-        if vi > 0 and fade_len > 0:
-            fade_in = numpy.linspace(0, 1, fade_len)
-            segment[:fade_len] *= fade_in
-        if vi < n_vowels - 1 and fade_len > 0:
-            fade_out = numpy.linspace(1, 0, fade_len)
-            segment[-fade_len:] *= fade_out
-
-        out[start:end] += segment[:end - start]
-
-    # Check for consonant-like onsets
+    # ── Consonant onsets ──
     lyric_lower = lyric.lower()
-    has_consonant = lyric_lower and lyric_lower[0] not in 'aeiou'
-
-    if has_consonant:
+    if lyric_lower and lyric_lower[0] not in 'aeiou':
         c = lyric_lower[0]
-        cons_len = min(int(SAMPLE_RATE * 0.03), n_samples)
+        cl = min(int(SAMPLE_RATE * 0.035), n_samples)
         if c in 'tdkpb':
-            # Plosive — brief noise burst
-            plosive = rng.uniform(-0.4, 0.4, cons_len)
-            plosive *= numpy.exp(-numpy.linspace(0, 15, cons_len))
-            out[:cons_len] = plosive + out[:cons_len] * 0.3
+            burst = rng.uniform(-0.5, 0.5, cl) * numpy.exp(-numpy.linspace(0, 18, cl))
+            out[:cl] = burst + out[:cl] * 0.2
         elif c in 'sz':
-            # Sibilant — filtered noise
-            sib = rng.uniform(-0.3, 0.3, cons_len)
-            if cons_len > 20:
-                bl, al = _sig.butter(2, [3000, min(8000, SAMPLE_RATE // 2 - 1)],
-                                     btype='band', fs=SAMPLE_RATE)
-                sib = _sig.lfilter(bl, al, numpy.pad(sib, (0, max(0, n_samples - cons_len))))[:cons_len]
-            sib *= numpy.exp(-numpy.linspace(0, 8, cons_len))
-            out[:cons_len] = sib * 0.5 + out[:cons_len] * 0.5
+            sib = rng.uniform(-0.4, 0.4, cl)
+            if cl > 20:
+                bl, al = _sig.butter(2, [3000, min(8000, SAMPLE_RATE//2-1)], btype='band', fs=SAMPLE_RATE)
+                sib = _sig.lfilter(bl, al, numpy.pad(sib, (0, max(0, n_samples-cl))))[:cl]
+            sib *= numpy.exp(-numpy.linspace(0, 10, cl))
+            out[:cl] = sib * 0.6 + out[:cl] * 0.4
         elif c in 'mn':
-            # Nasal — low formant
-            nasal_len = min(int(SAMPLE_RATE * 0.05), n_samples)
-            nasal = numpy.sin(2 * numpy.pi * 250 * t[:nasal_len]) * 0.3
-            nasal *= numpy.exp(-numpy.linspace(0, 5, nasal_len))
-            out[:nasal_len] = nasal + out[:nasal_len] * 0.5
+            nl = min(int(SAMPLE_RATE * 0.06), n_samples)
+            nasal = numpy.sin(2*numpy.pi*250*t[:nl]) * 0.4 * numpy.exp(-numpy.linspace(0, 4, nl))
+            out[:nl] = nasal + out[:nl] * 0.4
         elif c in 'fv':
-            # Fricative
-            fric = rng.uniform(-0.2, 0.2, cons_len)
-            fric *= numpy.exp(-numpy.linspace(0, 10, cons_len))
-            out[:cons_len] = fric * 0.4 + out[:cons_len] * 0.6
+            fric = rng.uniform(-0.25, 0.25, cl) * numpy.exp(-numpy.linspace(0, 12, cl))
+            out[:cl] = fric * 0.5 + out[:cl] * 0.5
         elif c in 'lr':
-            # Liquid — brief glide
-            glide_len = min(int(SAMPLE_RATE * 0.04), n_samples)
-            glide_t = numpy.arange(glide_len, dtype=numpy.float64) / SAMPLE_RATE
-            glide_hz = hz * 0.8 + hz * 0.2 * numpy.linspace(0, 1, glide_len)
-            glide = numpy.sin(numpy.cumsum(2 * numpy.pi * glide_hz / SAMPLE_RATE)) * 0.3
-            out[:glide_len] = glide + out[:glide_len] * 0.7
+            gl = min(int(SAMPLE_RATE * 0.05), n_samples)
+            ghz = hz * 0.7 + hz * 0.3 * numpy.linspace(0, 1, gl)
+            glide = numpy.sin(numpy.cumsum(2*numpy.pi*ghz/SAMPLE_RATE)) * 0.35
+            out[:gl] = glide + out[:gl] * 0.65
         elif c == 'h':
-            # Aspirate — breathy onset
-            h_len = min(int(SAMPLE_RATE * 0.04), n_samples)
-            aspirate = rng.uniform(-0.3, 0.3, h_len)
-            aspirate *= numpy.exp(-numpy.linspace(0, 6, h_len))
-            out[:h_len] = aspirate * 0.5 + out[:h_len] * 0.5
+            hl = min(int(SAMPLE_RATE * 0.05), n_samples)
+            asp = rng.uniform(-0.4, 0.4, hl) * numpy.exp(-numpy.linspace(0, 5, hl))
+            out[:hl] = asp * 0.6 + out[:hl] * 0.4
         elif c == 'w':
-            # Glide from U formant
-            w_len = min(int(SAMPLE_RATE * 0.05), n_samples)
-            w_t = numpy.arange(w_len, dtype=numpy.float64) / SAMPLE_RATE
-            w_source = numpy.sin(numpy.cumsum(2 * numpy.pi * hz / SAMPLE_RATE * numpy.ones(w_len)))
-            if w_len > 20:
-                bp, ap = _sig.butter(2, [max(20, 300), min(800, SAMPLE_RATE // 2 - 1)],
-                                     btype='band', fs=SAMPLE_RATE)
-                w_source = _sig.lfilter(bp, ap, w_source)
-            w_source *= numpy.linspace(0.5, 0, w_len) * 0.4
-            out[:w_len] = w_source + out[:w_len] * 0.6
+            wl = min(int(SAMPLE_RATE * 0.06), n_samples)
+            ws = numpy.sin(numpy.cumsum(2*numpy.pi*hz/SAMPLE_RATE*numpy.ones(wl)))
+            if wl > 20:
+                bp, ap = _sig.butter(2, [max(20,300), min(800, SAMPLE_RATE//2-1)], btype='band', fs=SAMPLE_RATE)
+                ws = _sig.lfilter(bp, ap, ws)
+            ws *= numpy.linspace(0.5, 0, wl)
+            out[:wl] = ws * 0.4 + out[:wl] * 0.6
 
     mx = numpy.abs(out).max()
     if mx > 0:

From 751d5a49b85dd9bfcf3fdaa99297c348b8ae80bc Mon Sep 17 00:00:00 2001
From: Kenneth Reitz <me@kennethreitz.org>
Date: Fri, 27 Mar 2026 12:17:12 -0400
Subject: [PATCH 3/3] Cleaner vocal synth: less static, click-free note
 transitions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Jitter reduced (0.3% → 0.1%), shimmer reduced (2% → 0.8%)
- Breath noise halved (0.08 → 0.04), mix 85/15 → 92/8
- 10ms fade in/out (capped at a quarter of the note length) on every
  vocal note prevents clicks
- Smoother syllable transitions

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 pytheory/play.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/pytheory/play.py b/pytheory/play.py
index c2c1a57..4e90b4c 100644
--- a/pytheory/play.py
+++ b/pytheory/play.py
@@ -945,8 +945,8 @@ def vocal_wave(hz, peak=SAMPLE_PEAK, n_samples=SAMPLE_RATE, lyric="ah"):
     # Asymmetric pulse: slow open phase, sharp closure, then closed phase.
     # Much more "voice-like" than a sine or sawtooth.
     # Jitter (pitch irregularity) + shimmer (amplitude irregularity)
-    jitter = rng.normal(0, hz * 0.003, n_samples)  # ~0.3% pitch jitter
-    shimmer = 1.0 + rng.normal(0, 0.02, n_samples)  # ~2% amp shimmer
+    jitter = rng.normal(0, hz * 0.001, n_samples)  # ~0.1% pitch jitter
+    shimmer = 1.0 + rng.normal(0, 0.008, n_samples)  # ~0.8% amp shimmer
     # Vibrato
     vib = hz * 0.001 * numpy.sin(2 * numpy.pi * 5.5 * t)
     inst_freq = hz + vib + jitter
@@ -959,9 +959,9 @@ def vocal_wave(hz, peak=SAMPLE_PEAK, n_samples=SAMPLE_RATE, lyric="ah"):
                           -numpy.sin(numpy.pi * (saw - 0.6) / 0.4) * 0.8)  # sharp fall
     glottal *= shimmer
 
-    # Aspiration noise (breathiness)
-    breath = rng.normal(0, 0.08, n_samples)
-    source = glottal * 0.85 + breath * 0.15
+    # Aspiration noise (breathiness) — subtle
+    breath = rng.normal(0, 0.04, n_samples)
+    source = glottal * 0.92 + breath * 0.08
 
     # ── Formant filtering ──
     n_vowels = len(vowels_in_lyric)
@@ -1039,6 +1039,12 @@ def vocal_wave(hz, peak=SAMPLE_PEAK, n_samples=SAMPLE_RATE, lyric="ah"):
             ws *= numpy.linspace(0.5, 0, wl)
             out[:wl] = ws * 0.4 + out[:wl] * 0.6
 
+    # Soft edges — prevent clicks at note boundaries
+    fade_samples = min(int(SAMPLE_RATE * 0.01), n_samples // 4)
+    if fade_samples > 0:
+        out[:fade_samples] *= numpy.linspace(0, 1, fade_samples)
+        out[-fade_samples:] *= numpy.linspace(1, 0, fade_samples)
+
     mx = numpy.abs(out).max()
     if mx > 0:
         out /= mx