// Extract just the lyrics from a pasted Google search-results page.
//
// Google's knowledge panel shows a standalone "Lyrics" header, the verses,
// then a "Source:" / "Songwriter(s):" footer. Everything from the footer on
// (result snippets, videos, "People also search for") is dropped.
//
// Both markers are required: text that merely contains a "Lyrics" line
// (a poem, say) yields null so the caller can pass it through whole.
//
// t       - pasted text; it is split on '\n' only.
// returns - the lyric block with outer whitespace trimmed and a single
//           trailing '\n', or null when the header or footer is missing or
//           nothing but whitespace lies between them.
function extractGoogleBlock(t){
  const lines = t.split('\n');
  // Markers are matched on trimmed copies; the returned block is sliced from
  // the untouched lines so indentation inside the lyrics survives.
  const trimmed = lines.map((line) => line.trim());

  const header = trimmed.indexOf('Lyrics');
  if(header < 0) return null;

  // Step over repeated "Lyrics" headers and blank lines directly below the
  // first header so the block starts at the first real line.
  let start = header;
  while(start + 1 < trimmed.length &&
        (trimmed[start + 1] === 'Lyrics' || !trimmed[start + 1])) start++;

  // The candidate line is already trimmed, so a "Source:" whose provider name
  // wrapped onto the next line has no trailing whitespace left to match;
  // accept end-of-line as well. "Source:x" (no separator) is still rejected.
  const footer = /^(?:Source:(?:\s|$)|Songwriters?:)/i;
  const end = trimmed.findIndex((line, i) => i > start && footer.test(line));
  if(end < 0) return null;

  const block = lines.slice(start + 1, end).join('\n').trim();
  return block ? `${block}\n` : null;
}