@@ -3,41 +3,58 @@ import 'dotenv/config'
33import fs from 'node:fs/promises'
44import path from 'node:path'
55
6- import { globby } from 'globby'
76import { OpenAIClient } from 'openai-fetch'
87import pMap from 'p-map'
98
10- import type { ContentChunk } from './types'
9+ import type { BookMetadata , ContentChunk , TocItem } from './types'
1110import { assert , getEnv } from './utils'
1211
1312async function main ( ) {
1413 const asin = getEnv ( 'ASIN' )
1514 assert ( asin , 'ASIN is required' )
1615
1716 const outDir = path . join ( 'out' , asin )
18- const pageScreenshotsDir = path . join ( outDir , 'pages' )
19- const pageScreenshots = await globby ( `${ pageScreenshotsDir } /*.png` )
20- assert ( pageScreenshots . length , 'no page screenshots found' )
17+ const metadata = JSON . parse (
18+ await fs . readFile ( path . join ( outDir , 'metadata.json' ) , 'utf8' )
19+ ) as BookMetadata
20+ assert ( metadata . pages ?. length , 'no page screenshots found' )
21+ assert ( metadata . toc ?. length , 'invalid book metadata: missing toc' )
22+
23+ // eslint-disable-next-line unicorn/no-array-reduce
24+ const pageToTocItemMap = metadata . toc . reduce (
25+ ( acc , tocItem ) => {
26+ if ( tocItem . page !== undefined ) {
27+ acc [ tocItem . page ] = tocItem
28+ }
29+ return acc
30+ } ,
31+ { } as Record < number , TocItem >
32+ )
33+
34+ // const pageScreenshotsDir = path.join(outDir, 'pages')
35+ // const pageScreenshots = await globby(`${pageScreenshotsDir}/*.png`)
36+ // assert(pageScreenshots.length, 'no page screenshots found')
2137
2238 const openai = new OpenAIClient ( )
2339
2440 const content : ContentChunk [ ] = (
2541 await pMap (
26- pageScreenshots ,
27- async ( screenshot ) => {
42+ metadata . pages ,
43+ async ( pageChunk , pageChunkIndex ) => {
44+ const { screenshot, index, page } = pageChunk
2845 const screenshotBuffer = await fs . readFile ( screenshot )
2946 const screenshotBase64 = `data:image/png;base64,${ screenshotBuffer . toString ( 'base64' ) } `
30- const metadataMatch = screenshot . match ( / 0 * ( \d + ) - \0 * ( \d + ) .p n g / )
31- assert (
32- metadataMatch ?. [ 1 ] && metadataMatch ?. [ 2 ] ,
33- `invalid screenshot filename: ${ screenshot } `
34- )
35- const index = Number . parseInt ( metadataMatch [ 1 ] ! , 10 )
36- const page = Number . parseInt ( metadataMatch [ 2 ] ! , 10 )
37- assert (
38- ! Number . isNaN ( index ) && ! Number . isNaN ( page ) ,
39- `invalid screenshot filename: ${ screenshot } `
40- )
47+ // const metadataMatch = screenshot.match(/0*(\d+)-\0*(\d+).png/)
48+ // assert(
49+ // metadataMatch?.[1] && metadataMatch?.[2],
50+ // `invalid screenshot filename: ${screenshot}`
51+ // )
52+ // const index = Number.parseInt(metadataMatch[1]!, 10)
53+ // const page = Number.parseInt(metadataMatch[2]!, 10)
54+ // assert(
55+ // !Number.isNaN(index) && !Number.isNaN(page),
56+ // `invalid screenshot filename: ${screenshot}`
57+ // )
4158
4259 try {
4360 const maxRetries = 20
@@ -69,7 +86,7 @@ Do not include any additional text, descriptions, or punctuation. Ignore any emb
6986 } )
7087
7188 const rawText = res . choices [ 0 ] ! . message . content !
72- const text = rawText
89+ let text = rawText
7390 . replace ( / ^ \s * \d + \s * $ \n + / m, '' )
7491 // .replaceAll(/\n+/g, '\n')
7592 . replaceAll ( / ^ \s * / gm, '' )
@@ -95,6 +112,18 @@ Do not include any additional text, descriptions, or punctuation. Ignore any emb
95112 continue
96113 }
97114
115+ const prevPageChunk = metadata . pages [ pageChunkIndex - 1 ]
116+ if ( prevPageChunk && prevPageChunk . page !== page ) {
117+ const tocItem = pageToTocItemMap [ page ]
118+ if ( tocItem ) {
119+ text = text . replace (
120+ // eslint-disable-next-line security/detect-non-literal-regexp
121+ new RegExp ( `^${ tocItem . label } \\s*` , 'i' ) ,
122+ ''
123+ )
124+ }
125+ }
126+
98127 const result : ContentChunk = {
99128 index,
100129 page,
0 commit comments