@@ -3,15 +3,14 @@ import 'dotenv/config'
33import fs from 'node:fs/promises'
44import path from 'node:path'
55
6- import type { SetOptional } from 'type-fest'
76import { input } from '@inquirer/prompts'
87import delay from 'delay'
98import pRace from 'p-race'
109// import { chromium } from 'playwright'
1110import { chromium } from 'patchright'
1211import sharp from 'sharp'
1312
14- import type { $TocItem , BookMetadata } from './types'
13+ import type { BookMetadata , TocItem } from './types'
1514import { parsePageNav , parseTocItems } from './playwright-utils'
1615import { assert , getEnv , normalizeAuthors , parseJsonpResponse } from './utils'
1716
@@ -31,6 +30,7 @@ async function main() {
3130 const asin = getEnv ( 'ASIN' )
3231 const amazonEmail = getEnv ( 'AMAZON_EMAIL' )
3332 const amazonPassword = getEnv ( 'AMAZON_PASSWORD' )
33+ const force = ! ! getEnv ( 'FORCE' )
3434 assert ( asin , 'ASIN is required' )
3535 assert ( amazonEmail , 'AMAZON_EMAIL is required' )
3636 assert ( amazonPassword , 'AMAZON_PASSWORD is required' )
@@ -39,16 +39,28 @@ async function main() {
3939 const outDir = path . join ( 'out' , asin )
4040 const userDataDir = path . join ( outDir , 'data' )
4141 const pageScreenshotsDir = path . join ( outDir , 'pages' )
42+ const metadataPath = path . join ( outDir , 'metadata.json' )
4243 await fs . mkdir ( userDataDir , { recursive : true } )
4344 await fs . mkdir ( pageScreenshotsDir , { recursive : true } )
4445
4546 const krRendererMainImageSelector = '#kr-renderer .kg-full-page-img img'
4647 const bookReaderUrl = `https://read.amazon.com/?asin=${ asin } `
4748
48- const result : SetOptional < BookMetadata , 'info' | 'meta' > = {
49+ const result : BookMetadata = {
50+ meta : { } as any ,
51+ info : { } as any ,
4952 toc : [ ] ,
5053 pages : [ ]
5154 }
55+ let prevBookMetadata : Partial < BookMetadata > = { }
56+
57+ if ( ! force ) {
58+ try {
59+ prevBookMetadata = JSON . parse (
60+ await fs . readFile ( metadataPath , 'utf8' )
61+ ) as Partial < BookMetadata >
62+ } catch { }
63+ }
5264
5365 const context = await chromium . launchPersistentContext ( userDataDir , {
5466 headless : false ,
@@ -101,14 +113,26 @@ async function main() {
101113 const body = await response . text ( )
102114 const metadata = parseJsonpResponse < any > ( body )
103115 if ( metadata . asin !== asin ) return
116+
104117 delete metadata . cpr
105118 if ( Array . isArray ( metadata . authorsList ) ) {
106119 metadata . authorsList = normalizeAuthors ( metadata . authorsList )
107120 }
108- if ( ! result . meta ) {
109- console . warn ( 'book meta' , metadata )
121+
122+ if ( ! Object . keys ( result . meta ) . length ) {
123+ if (
124+ metadata . version &&
125+ metadata . version === prevBookMetadata . meta ?. version
126+ ) {
127+ if ( ! result . toc . length && prevBookMetadata . toc ?. length ) {
128+ // Use previously extracted TOC
129+ console . warn ( 'using cached TOC' , prevBookMetadata . toc )
130+ result . toc = prevBookMetadata . toc
131+ }
132+ }
133+
134+ result . meta = metadata
110135 }
111- result . meta = metadata
112136 } else if (
113137 url . hostname === 'read.amazon.com' &&
114138 url . searchParams . get ( 'asin' ) ?. toLowerCase ( ) === asinL
@@ -118,12 +142,12 @@ async function main() {
118142 delete body . karamelToken
119143 delete body . metadataUrl
120144 delete body . YJFormatVersion
121- if ( ! result . info ) {
145+ if ( ! Object . keys ( result . info ) . length ) {
122146 console . warn ( 'book info' , body )
123147 }
124148 result . info = body
125149 } else if ( url . pathname === '/renderer/render' ) {
126- // TODO
150+ // TODO: these TAR files have some useful metadata that we could use...
127151 // const body = await response.body()
128152 // const tempDir = await extractTarToTemp(body)
129153 // const toc = JSON.parse(
@@ -224,7 +248,7 @@ async function main() {
224248
225249 async function updateSettings ( ) {
226250 await page . locator ( 'ion-button[aria-label="Reader settings"]' ) . click ( )
227- await delay ( 1000 )
251+ await delay ( 500 )
228252
229253 // Change font to Amazon Ember
230254 // My hypothesis is that this font will be easier for OCR to transcribe...
@@ -239,7 +263,7 @@ async function main() {
239263 . click ( )
240264
241265 await page . locator ( 'ion-button[aria-label="Reader settings"]' ) . click ( )
242- await delay ( 1000 )
266+ await delay ( 500 )
243267 }
244268
245269 async function goToPage ( pageNumber : number ) {
@@ -257,7 +281,7 @@ async function main() {
257281 await page
258282 . locator ( 'ion-modal ion-button[item-i-d="go-to-modal-go-button"]' )
259283 . click ( )
260- await delay ( 1000 )
284+ await delay ( 500 )
261285 }
262286
263287 async function getPageNav ( ) {
@@ -283,21 +307,18 @@ async function main() {
283307 }
284308
285309 async function writeResultMetadata ( ) {
286- return fs . writeFile (
287- path . join ( outDir , 'metadata.json' ) ,
288- JSON . stringify ( result , null , 2 )
289- )
310+ return fs . writeFile ( metadataPath , JSON . stringify ( result , null , 2 ) )
290311 }
291312
292313 await dismissPossibleAlert ( )
293314 await ensureFixedHeaderUI ( )
294315 await updateSettings ( )
295316
296317 const initialPageNav = await getPageNav ( )
297- let totalPages = 5
298- let totalContentPages = 5
299318
300- {
319+ if ( ! force && result . toc . length ) {
320+ // Using a cached table of contents
321+ } else {
301322 // Extract the table of contents
302323 await page . locator ( 'ion-button[aria-label="Table of Contents"]' ) . click ( )
303324 await delay ( 2000 )
@@ -306,7 +327,7 @@ async function main() {
306327 const $tocTopLevelItems = await page
307328 . locator ( 'ion-list > div > ion-item' )
308329 . all ( )
309- const tocItems : Array < $ TocItem> = [ ]
330+ const tocItems : Array < TocItem > = [ ]
310331
311332 console . warn ( `initializing ${ numTocItems } TOC items...` )
312333
@@ -333,10 +354,9 @@ async function main() {
333354 const pageNav = await getPageNav ( )
334355 assert ( pageNav )
335356
336- const currentTocItem : $ TocItem = {
357+ const currentTocItem : TocItem = {
337358 label,
338- ...pageNav ,
339- locator : $tocItem
359+ ...pageNav
340360 }
341361 tocItems . push ( currentTocItem )
342362
@@ -377,29 +397,32 @@ async function main() {
377397 }
378398 }
379399
380- const { locator : _ , ...debugTocItem } = currentTocItem
381- console . warn ( debugTocItem )
400+ console . warn ( currentTocItem )
382401 }
383402
384- const parsedToc = parseTocItems ( tocItems )
385- console . log ( 'parsed TOC' , parsedToc )
386- result . toc = tocItems . map ( ( { locator : _ , ...tocItem } ) => tocItem )
403+ result . toc = tocItems
387404
388- totalPages = parsedToc . firstContentPageTocItem . total
389- totalContentPages = Math . min (
390- parsedToc . firstPostContentPageTocItem ?. page ?? totalPages ,
391- totalPages
392- )
393- assert ( totalContentPages > 0 , 'No content pages found' )
405+ await page . locator ( '.side-menu-close-button' ) . click ( )
406+ await delay ( 500 )
394407
395408 // Navigate to the first content page of the book
396- await parsedToc . firstContentPageTocItem . locator ! . click ( )
409+ // await parsedToc.firstContentPageTocItem.locator!.click()
397410 }
398411
399- await page . locator ( '.side-menu-close-button' ) . click ( )
400- await delay ( 1000 )
412+ const parsedToc = parseTocItems ( result . toc )
401413
414+ const totalPages = parsedToc . firstContentPageTocItem . total
415+ const totalContentPages = Math . min (
416+ parsedToc . firstPostContentPageTocItem ?. page ?? totalPages ,
417+ totalPages
418+ )
419+ assert ( totalContentPages > 0 , 'No content pages found' )
402420 const pageNumberPaddingAmount = `${ totalContentPages * 2 } ` . length
421+ await writeResultMetadata ( )
422+
423+ // Navigate to the first content page of the book
424+ await goToPage ( parsedToc . firstContentPageTocItem . page ! ?? 1 )
425+
403426 let maxPageSeen = - 1
404427 let done = false
405428 console . warn (
0 commit comments