11import { readdir } from 'node:fs/promises'
22import path from 'node:path'
3+ import type { RecipeFields } from '../src/types/recipe.interface'
34import { isPlainObject , isString } from '../src/utils'
4- import { splitToList } from '../src/utils/parsing'
5+ import { removeInstructionHeading } from '../src/utils/instructions'
6+ import { normalizeString , splitToList } from '../src/utils/parsing'
57
68const INPUT_DIR = path . resolve ( import . meta. dir , '../.temp' )
79const OUTPUT_DIR = path . resolve ( import . meta. dir , '../test-data' )
@@ -34,6 +36,25 @@ const LIST_FIELDS = [
3436 'keywords' ,
3537] as const
3638
/**
 * Per-site overrides for known bad data.
 *
 * Keyed by hostname, then by the JSON file name without extension. Each entry
 * maps recipe field names to the value that should replace whatever was
 * produced for that field.
 *
 * The table is declared with an explicit `Record<string, …>` annotation
 * (rather than `as const satisfies …`) so that it can be indexed with
 * arbitrary `string` host / file names: a lookup for an unknown key is
 * well-typed and simply yields `undefined` at runtime. Field names are still
 * checked against `keyof RecipeFields`.
 */
const OVERRIDE_VALUES: Record<
  string,
  Record<string, Partial<Record<keyof RecipeFields, unknown>>>
> = {
  'cooking.nytimes.com': {
    nytimes: {
      yields: '5 cups (about 120 to 160 crackers)',
    },
  },
  'epicurious.com': {
    epicurious: {
      canonicalUrl:
        'https://www.epicurious.com/recipes/food/views/ramen-noodle-bowl-with-escarole-and-spicy-tofu-crumbles',
    },
  },
}
57+
3758/**
3859 * Returns true if the given path exists and is a directory
3960 */
@@ -73,29 +94,23 @@ export function groupIngredientItems(
7394
7495 for ( const { ingredients, purpose } of input ) {
7596 const title = isString ( purpose ) ? purpose . trim ( ) : 'Ingredients'
76- const items = Array . isArray ( ingredients ) ? ingredients . filter ( isString ) : [ ]
97+ const items = Array . isArray ( ingredients )
98+ ? ingredients . filter ( isString ) . map ( normalizeString )
99+ : [ ]
77100
78101 result [ title ] = items
79102 }
80103
81104 return result
82105}
83106
84- /** Read JSON, normalize keys + defaults, write to outPath */
85- async function processJson ( inPath : string , outPath : string ) {
86- let raw : string
87- let data : Record < string , unknown >
88-
89- try {
90- raw = await Bun . file ( inPath ) . text ( )
91- data = JSON . parse ( raw )
92- } catch {
93- console . error ( `Skipping invalid JSON: ${ inPath } ` )
94- return
95- }
96-
107+ function normalizeData (
108+ host : string ,
109+ filename : string ,
110+ data : Record < string , unknown > ,
111+ ) {
97112 // start with default values
98- const result : Record < string , unknown > = {
113+ let result : Record < string , unknown > = {
99114 ...DEFAULT_VALUES ,
100115 }
101116
@@ -115,14 +130,23 @@ async function processJson(inPath: string, outPath: string) {
115130 result [ prop ] = value
116131 }
117132
118- if (
119- Array . isArray ( result . ingredients ) &&
120- result . ingredients . every ( isIngredientGroup )
121- ) {
122- result . ingredients = groupIngredientItems ( result . ingredients )
133+ // Clean instructions
134+ if ( Array . isArray ( result . instructions ) ) {
135+ result . instructions = result . instructions
136+ . map ( removeInstructionHeading )
137+ . filter ( Boolean )
138+ }
139+
140+ // Clean & group ingredients
141+ if ( Array . isArray ( result . ingredients ) ) {
142+ if ( result . ingredients . every ( isIngredientGroup ) ) {
143+ result . ingredients = groupIngredientItems ( result . ingredients )
144+ } else {
145+ result . ingredients = result . ingredients . map ( normalizeString )
146+ }
123147 }
124148
125- // ensure certain fields are always arrays
149+ // Ensure certain fields are always arrays
126150 for ( const field of LIST_FIELDS ) {
127151 const v = result [ field ]
128152
@@ -131,21 +155,47 @@ async function processJson(inPath: string, outPath: string) {
131155 }
132156 }
133157
134- const output = result
158+ // Apply per-site overrides
159+ const overrides = OVERRIDE_VALUES [ host ] ?. [ filename ]
160+
161+ if ( overrides ) {
162+ result = { ...result , ...overrides }
163+ }
164+
165+ return result
166+ }
167+
/**
 * Read a JSON file, normalize its data, and write the result to outPath.
 *
 * The file is skipped (an error is logged and nothing is written) when it
 * cannot be read or parsed, or when its top-level JSON value is not a plain
 * object.
 *
 * @param host - Hostname forwarded to normalizeData, which uses it to look up
 *   per-site overrides
 * @param inPath - Path of the JSON file to read; its base name without the
 *   `.json` extension is forwarded to normalizeData as the file name
 * @param outPath - Path the normalized, pretty-printed JSON is written to
 */
async function processJson(host: string, inPath: string, outPath: string) {
  let parsed: unknown

  try {
    parsed = JSON.parse(await Bun.file(inPath).text())
  } catch {
    console.error(`Skipping invalid JSON: ${inPath}`)
    return
  }

  // JSON.parse happily returns null, arrays and primitives; normalizeData
  // expects a keyed object, so reject anything else up front instead of
  // letting it fail (or produce garbage) further down.
  if (!isPlainObject(parsed)) {
    console.error(`Skipping non-object JSON: ${inPath}`)
    return
  }

  // Safe after the runtime check above.
  const data = parsed as Record<string, unknown>
  const filename = path.basename(inPath, '.json')
  const output = normalizeData(host, filename, data)
  const content = JSON.stringify(output, null, 2)

  await Bun.write(outPath, content)
}
139189
140190/** Recursively traverse input directory, mirroring structure in output dir */
141- async function traverse ( inDir : string , outDir : string ) {
191+ async function traverse ( host : string , inDir : string , outDir : string ) {
142192 for ( const entry of await readdir ( inDir , { withFileTypes : true } ) ) {
143193 const inPath = path . join ( inDir , entry . name )
144194 const outPath = path . join ( outDir , entry . name )
145195 const relativePath = outPath . substring ( OUTPUT_DIR . length + 1 )
146196
147197 if ( entry . isDirectory ( ) ) {
148- await traverse ( inPath , outPath )
198+ await traverse ( host , inPath , outPath )
149199 } else if ( entry . isFile ( ) ) {
150200 const exists = await Bun . file ( outPath ) . exists ( )
151201
@@ -155,7 +205,7 @@ async function traverse(inDir: string, outDir: string) {
155205 }
156206
157207 if ( entry . name . endsWith ( '.json' ) ) {
158- await processJson ( inPath , outPath )
208+ await processJson ( host , inPath , outPath )
159209 console . log ( `Processed: ${ relativePath } ` )
160210 } else {
161211 // copy non-JSON files unchanged
@@ -177,7 +227,7 @@ async function main(host: string | undefined) {
177227 return
178228 }
179229
180- await traverse ( inDir , outDir )
230+ await traverse ( host , inDir , outDir )
181231 } else {
182232 console . error ( 'Usage: bun process-test-data <host>' )
183233 }
0 commit comments