Skip to content

Commit 6c9e648

Browse files
committed
🐛 Fix webhookhandler transformPost
1 parent 9113cc2 commit 6c9e648

File tree

5 files changed

+519
-0
lines changed

5 files changed

+519
-0
lines changed

apps/webhook-handler/app/api/webhook/route.ts

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import { NextRequest, NextResponse } from 'next/server';
2+
import * as cheerio from 'cheerio';
23

34
// Set the runtime to edge for better performance
45
export const runtime = 'edge';
@@ -197,11 +198,91 @@ class GhostMeilisearchManager {
197198
const publishedAt = post.published_at ? new Date(post.published_at).getTime() : null;
198199
const updatedAt = post.updated_at ? new Date(post.updated_at).getTime() : null;
199200

201+
// Generate plaintext from HTML
202+
let plaintext = '';
203+
if (post.html) {
204+
try {
205+
// Load HTML into cheerio
206+
const $ = cheerio.load(post.html);
207+
208+
// Remove script and style tags with their content
209+
$('script, style').remove();
210+
211+
// Extract alt text from images and add it to the text
212+
$('img').each((_, el) => {
213+
const alt = $(el).attr('alt');
214+
if (alt) {
215+
$(el).replaceWith(` ${alt} `);
216+
} else {
217+
$(el).remove();
218+
}
219+
});
220+
221+
// Handle special block elements for better formatting
222+
// Add line breaks for block elements to preserve structure
223+
$('p, div, h1, h2, h3, h4, h5, h6, br, hr, blockquote').each((_, el) => {
224+
$(el).append('\n');
225+
});
226+
227+
// Special handling for list items
228+
$('li').each((_, el) => {
229+
$(el).prepend('• ');
230+
$(el).append('\n');
231+
});
232+
233+
// Handle tables - add spacing and structure
234+
$('tr').each((_, el) => {
235+
$(el).append('\n');
236+
});
237+
238+
// Handle links - keep their text
239+
$('a').each((_, el) => {
240+
const href = $(el).attr('href');
241+
const text = $(el).text().trim();
242+
// If the link has text and it's not just the URL, preserve it
243+
if (text && href !== text) {
244+
$(el).replaceWith(` ${text} `);
245+
}
246+
});
247+
248+
// Get the text content of the body
249+
// Cheerio's text() method automatically handles most HTML entities
250+
plaintext = $('body').text();
251+
252+
// Normalize whitespace
253+
plaintext = plaintext.replace(/\s+/g, ' ').trim();
254+
} catch (error) {
255+
// Fallback to simple regex if cheerio parsing fails
256+
console.error('HTML parsing error:', error);
257+
258+
plaintext = post.html
259+
// Remove script and style tags with their content
260+
.replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, ' ')
261+
.replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, ' ')
262+
// Replace each link that contains only text with that text, padded with spaces
263+
.replace(/<a[^>]*>([^<]*)<\/a>/gi, ' $1 ')
264+
// Replace inline elements with their content, preserving spaces
265+
.replace(/<(strong|b|em|i|mark|span)[^>]*>([^<]*)<\/(strong|b|em|i|mark|span)>/gi, ' $2 ')
266+
// Replace all remaining HTML tags with spaces to preserve word boundaries
267+
.replace(/<[^>]*>/g, ' ')
268+
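// Decode a small set of common HTML entities (&nbsp; &amp; &lt; &gt; &quot; &#39;)
// NOTE(review): &amp; is decoded first, so doubly-escaped input such as "&amp;lt;" ends up as "<"; decode &amp; last to avoid this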
269+
.replace(/&nbsp;/g, ' ')
270+
.replace(/&amp;/g, '&')
271+
.replace(/&lt;/g, '<')
272+
.replace(/&gt;/g, '>')
273+
.replace(/&quot;/g, '"')
274+
.replace(/&#39;/g, "'")
275+
// Normalize whitespace
276+
.replace(/\s+/g, ' ').trim();
277+
}
278+
}
279+
200280
return {
201281
id: post.id,
202282
title: post.title,
203283
slug: post.slug,
204284
html: post.html,
285+
plaintext: plaintext,
205286
excerpt: post.excerpt || '',
206287
url: post.url,
207288
feature_image: post.feature_image,

0 commit comments

Comments
 (0)