|
1 | 1 | import { NextRequest, NextResponse } from 'next/server'; |
| 2 | +import * as cheerio from 'cheerio'; |
2 | 3 |
|
3 | 4 | // Run this route on the Edge runtime for better performance. NOTE(review): confirm the cheerio import above works on the Edge runtime. |
4 | 5 | export const runtime = 'edge'; |
@@ -197,11 +198,91 @@ class GhostMeilisearchManager { |
197 | 198 | const publishedAt = post.published_at ? new Date(post.published_at).getTime() : null; |
198 | 199 | const updatedAt = post.updated_at ? new Date(post.updated_at).getTime() : null; |
199 | 200 |
|
| 201 | + // Generate plaintext from HTML |
| 202 | + let plaintext = ''; |
| 203 | + if (post.html) { |
| 204 | + try { |
| 205 | + // Load HTML into cheerio |
| 206 | + const $ = cheerio.load(post.html); |
| 207 | + |
| 208 | + // Remove script and style tags with their content |
| 209 | + $('script, style').remove(); |
| 210 | + |
| 211 | + // Extract alt text from images and add it to the text |
| 212 | + $('img').each((_, el) => { |
| 213 | + const alt = $(el).attr('alt'); |
| 214 | + if (alt) { |
| 215 | + $(el).replaceWith(` ${alt} `); |
| 216 | + } else { |
| 217 | + $(el).remove(); |
| 218 | + } |
| 219 | + }); |
| 220 | + |
| 221 | + // Handle special block elements for better formatting |
| 222 | + // Add line breaks for block elements to preserve structure |
| 223 | + $('p, div, h1, h2, h3, h4, h5, h6, br, hr, blockquote').each((_, el) => { |
| 224 | + $(el).append('\n'); |
| 225 | + }); |
| 226 | + |
| 227 | + // Special handling for list items |
| 228 | + $('li').each((_, el) => { |
| 229 | + $(el).prepend('• '); |
| 230 | + $(el).append('\n'); |
| 231 | + }); |
| 232 | + |
| 233 | + // Handle tables - add spacing and structure |
| 234 | + $('tr').each((_, el) => { |
| 235 | + $(el).append('\n'); |
| 236 | + }); |
| 237 | + |
| 238 | + // Handle links - keep their text |
| 239 | + $('a').each((_, el) => { |
| 240 | + const href = $(el).attr('href'); |
| 241 | + const text = $(el).text().trim(); |
| 242 | + // If the link has text and it's not just the URL, preserve it |
| 243 | + if (text && href !== text) { |
| 244 | + $(el).replaceWith(` ${text} `); |
| 245 | + } |
| 246 | + }); |
| 247 | + |
| 248 | + // Get the text content of the body |
| 249 | + // Cheerio's text() method automatically handles most HTML entities |
| 250 | + plaintext = $('body').text(); |
| 251 | + |
| 252 | + // Normalize whitespace |
| 253 | + plaintext = plaintext.replace(/\s+/g, ' ').trim(); |
| 254 | + } catch (error) { |
| 255 | + // Fallback to simple regex if cheerio parsing fails |
| 256 | + console.error('HTML parsing error:', error); |
| 257 | + |
| 258 | + plaintext = post.html |
| 259 | + // Remove script and style tags with their content |
| 260 | + .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, ' ') |
| 261 | + .replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, ' ') |
| 262 | + // Replace link text with its content, preserving spaces |
| 263 | + .replace(/<a[^>]*>([^<]*)<\/a>/gi, ' $1 ') |
| 264 | + // Replace inline elements with their content, preserving spaces |
| 265 | + .replace(/<(strong|b|em|i|mark|span)[^>]*>([^<]*)<\/(strong|b|em|i|mark|span)>/gi, ' $2 ') |
| 266 | + // Replace all remaining HTML tags with spaces to preserve word boundaries |
| 267 | + .replace(/<[^>]*>/g, ' ') |
| 268 | + // Clean up entities and decode HTML entities |
| 269 | + .replace(/&nbsp;/g, ' ') |
| 270 | + .replace(/&amp;/g, '&') |
| 271 | + .replace(/&lt;/g, '<') |
| 272 | + .replace(/&gt;/g, '>') |
| 273 | + .replace(/&quot;/g, '"') |
| 274 | + .replace(/&#39;/g, "'") |
| 275 | + // Normalize whitespace |
| 276 | + .replace(/\s+/g, ' ').trim(); |
| 277 | + } |
| 278 | + } |
| 279 | + |
200 | 280 | return { |
201 | 281 | id: post.id, |
202 | 282 | title: post.title, |
203 | 283 | slug: post.slug, |
204 | 284 | html: post.html, |
| 285 | + plaintext: plaintext, |
205 | 286 | excerpt: post.excerpt || '', |
206 | 287 | url: post.url, |
207 | 288 | feature_image: post.feature_image, |
|
0 commit comments