Skip to content

Commit 4385d9e

Browse files
committed
chore: per site overrides in test data processing
1 parent d9b2637 commit 4385d9e

File tree

2 files changed

+78
-28
lines changed

2 files changed

+78
-28
lines changed

scripts/process-test-data.ts

Lines changed: 77 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
import { readdir } from 'node:fs/promises'
22
import path from 'node:path'
3+
import type { RecipeFields } from '../src/types/recipe.interface'
34
import { isPlainObject, isString } from '../src/utils'
4-
import { splitToList } from '../src/utils/parsing'
5+
import { removeInstructionHeading } from '../src/utils/instructions'
6+
import { normalizeString, splitToList } from '../src/utils/parsing'
57

68
const INPUT_DIR = path.resolve(import.meta.dir, '../.temp')
79
const OUTPUT_DIR = path.resolve(import.meta.dir, '../test-data')
@@ -34,6 +36,25 @@ const LIST_FIELDS = [
3436
'keywords',
3537
] as const
3638

39+
// Per-site overrides for known bad data
40+
// These are keyed by hostname, then by the JSON file name without extension
41+
const OVERRIDE_VALUES = {
42+
'cooking.nytimes.com': {
43+
nytimes: {
44+
yields: '5 cups (about 120 to 160 crackers)',
45+
},
46+
},
47+
'epicurious.com': {
48+
epicurious: {
49+
canonicalUrl:
50+
'https://www.epicurious.com/recipes/food/views/ramen-noodle-bowl-with-escarole-and-spicy-tofu-crumbles',
51+
},
52+
},
53+
} as const satisfies Record<
54+
string,
55+
Record<string, Partial<Record<keyof RecipeFields, unknown>>>
56+
>
57+
3758
/**
3859
* Returns true if the given path exists and is a directory
3960
*/
@@ -73,29 +94,23 @@ export function groupIngredientItems(
7394

7495
for (const { ingredients, purpose } of input) {
7596
const title = isString(purpose) ? purpose.trim() : 'Ingredients'
76-
const items = Array.isArray(ingredients) ? ingredients.filter(isString) : []
97+
const items = Array.isArray(ingredients)
98+
? ingredients.filter(isString).map(normalizeString)
99+
: []
77100

78101
result[title] = items
79102
}
80103

81104
return result
82105
}
83106

84-
/** Read JSON, normalize keys + defaults, write to outPath */
85-
async function processJson(inPath: string, outPath: string) {
86-
let raw: string
87-
let data: Record<string, unknown>
88-
89-
try {
90-
raw = await Bun.file(inPath).text()
91-
data = JSON.parse(raw)
92-
} catch {
93-
console.error(`Skipping invalid JSON: ${inPath}`)
94-
return
95-
}
96-
107+
function normalizeData(
108+
host: string,
109+
filename: string,
110+
data: Record<string, unknown>,
111+
) {
97112
// start with default values
98-
const result: Record<string, unknown> = {
113+
let result: Record<string, unknown> = {
99114
...DEFAULT_VALUES,
100115
}
101116

@@ -115,14 +130,23 @@ async function processJson(inPath: string, outPath: string) {
115130
result[prop] = value
116131
}
117132

118-
if (
119-
Array.isArray(result.ingredients) &&
120-
result.ingredients.every(isIngredientGroup)
121-
) {
122-
result.ingredients = groupIngredientItems(result.ingredients)
133+
// Clean instructions
134+
if (Array.isArray(result.instructions)) {
135+
result.instructions = result.instructions
136+
.map(removeInstructionHeading)
137+
.filter(Boolean)
138+
}
139+
140+
// Clean & group ingredients
141+
if (Array.isArray(result.ingredients)) {
142+
if (result.ingredients.every(isIngredientGroup)) {
143+
result.ingredients = groupIngredientItems(result.ingredients)
144+
} else {
145+
result.ingredients = result.ingredients.map(normalizeString)
146+
}
123147
}
124148

125-
// ensure certain fields are always arrays
149+
// Ensure certain fields are always arrays
126150
for (const field of LIST_FIELDS) {
127151
const v = result[field]
128152

@@ -131,21 +155,47 @@ async function processJson(inPath: string, outPath: string) {
131155
}
132156
}
133157

134-
const output = result
158+
// Apply per-site overrides
159+
const overrides = OVERRIDE_VALUES[host]?.[filename]
160+
161+
if (overrides) {
162+
result = { ...result, ...overrides }
163+
}
164+
165+
return result
166+
}
167+
168+
/**
169+
* Read JSON, normalize data, write to outPath
170+
*/
171+
async function processJson(host: string, inPath: string, outPath: string) {
172+
let raw: string
173+
let data: Record<string, unknown>
174+
175+
try {
176+
raw = await Bun.file(inPath).text()
177+
data = JSON.parse(raw)
178+
} catch {
179+
console.error(`Skipping invalid JSON: ${inPath}`)
180+
return
181+
}
182+
183+
const filename = path.basename(inPath, '.json')
184+
const output = normalizeData(host, filename, data)
135185
const content = JSON.stringify(output, null, 2)
136186

137187
await Bun.write(outPath, content)
138188
}
139189

140190
/** Recursively traverse input directory, mirroring structure in output dir */
141-
async function traverse(inDir: string, outDir: string) {
191+
async function traverse(host: string, inDir: string, outDir: string) {
142192
for (const entry of await readdir(inDir, { withFileTypes: true })) {
143193
const inPath = path.join(inDir, entry.name)
144194
const outPath = path.join(outDir, entry.name)
145195
const relativePath = outPath.substring(OUTPUT_DIR.length + 1)
146196

147197
if (entry.isDirectory()) {
148-
await traverse(inPath, outPath)
198+
await traverse(host, inPath, outPath)
149199
} else if (entry.isFile()) {
150200
const exists = await Bun.file(outPath).exists()
151201

@@ -155,7 +205,7 @@ async function traverse(inDir: string, outDir: string) {
155205
}
156206

157207
if (entry.name.endsWith('.json')) {
158-
await processJson(inPath, outPath)
208+
await processJson(host, inPath, outPath)
159209
console.log(`Processed: ${relativePath}`)
160210
} else {
161211
// copy non-JSON files unchanged
@@ -177,7 +227,7 @@ async function main(host: string | undefined) {
177227
return
178228
}
179229

180-
await traverse(inDir, outDir)
230+
await traverse(host, inDir, outDir)
181231
} else {
182232
console.error('Usage: bun process-test-data <host>')
183233
}

src/utils/instructions.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ const INSTRUCTION_HEADINGS = [
1414
/**
1515
* Removes any heading from the start of the instructions string.
1616
*/
17-
function removeInstructionHeading(value: string) {
17+
export function removeInstructionHeading(value: string) {
1818
for (const heading of INSTRUCTION_HEADINGS) {
1919
const regex = new RegExp(`^\\s*${heading}\\s*:?\\s*`, 'i')
2020
if (regex.test(value)) {

0 commit comments

Comments
 (0)