Skip to content

Commit beb247e

Browse files
🚐
1 parent bec6fee commit beb247e

File tree

3 files changed

+63
-43
lines changed

3 files changed

+63
-43
lines changed

src/extract-kindle-book.ts

Lines changed: 59 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,14 @@ import 'dotenv/config'
33
import fs from 'node:fs/promises'
44
import path from 'node:path'
55

6-
import type { SetOptional } from 'type-fest'
76
import { input } from '@inquirer/prompts'
87
import delay from 'delay'
98
import pRace from 'p-race'
109
// import { chromium } from 'playwright'
1110
import { chromium } from 'patchright'
1211
import sharp from 'sharp'
1312

14-
import type { $TocItem, BookMetadata } from './types'
13+
import type { BookMetadata, TocItem } from './types'
1514
import { parsePageNav, parseTocItems } from './playwright-utils'
1615
import { assert, getEnv, normalizeAuthors, parseJsonpResponse } from './utils'
1716

@@ -31,6 +30,7 @@ async function main() {
3130
const asin = getEnv('ASIN')
3231
const amazonEmail = getEnv('AMAZON_EMAIL')
3332
const amazonPassword = getEnv('AMAZON_PASSWORD')
33+
const force = !!getEnv('FORCE')
3434
assert(asin, 'ASIN is required')
3535
assert(amazonEmail, 'AMAZON_EMAIL is required')
3636
assert(amazonPassword, 'AMAZON_PASSWORD is required')
@@ -39,16 +39,28 @@ async function main() {
3939
const outDir = path.join('out', asin)
4040
const userDataDir = path.join(outDir, 'data')
4141
const pageScreenshotsDir = path.join(outDir, 'pages')
42+
const metadataPath = path.join(outDir, 'metadata.json')
4243
await fs.mkdir(userDataDir, { recursive: true })
4344
await fs.mkdir(pageScreenshotsDir, { recursive: true })
4445

4546
const krRendererMainImageSelector = '#kr-renderer .kg-full-page-img img'
4647
const bookReaderUrl = `https://read.amazon.com/?asin=${asin}`
4748

48-
const result: SetOptional<BookMetadata, 'info' | 'meta'> = {
49+
const result: BookMetadata = {
50+
meta: {} as any,
51+
info: {} as any,
4952
toc: [],
5053
pages: []
5154
}
55+
let prevBookMetadata: Partial<BookMetadata> = {}
56+
57+
if (!force) {
58+
try {
59+
prevBookMetadata = JSON.parse(
60+
await fs.readFile(metadataPath, 'utf8')
61+
) as Partial<BookMetadata>
62+
} catch {}
63+
}
5264

5365
const context = await chromium.launchPersistentContext(userDataDir, {
5466
headless: false,
@@ -101,14 +113,26 @@ async function main() {
101113
const body = await response.text()
102114
const metadata = parseJsonpResponse<any>(body)
103115
if (metadata.asin !== asin) return
116+
104117
delete metadata.cpr
105118
if (Array.isArray(metadata.authorsList)) {
106119
metadata.authorsList = normalizeAuthors(metadata.authorsList)
107120
}
108-
if (!result.meta) {
109-
console.warn('book meta', metadata)
121+
122+
if (!Object.keys(result.meta).length) {
123+
if (
124+
metadata.version &&
125+
metadata.version === prevBookMetadata.meta?.version
126+
) {
127+
if (!result.toc.length && prevBookMetadata.toc?.length) {
128+
// Use previously extracted TOC
129+
console.warn('using cached TOC', prevBookMetadata.toc)
130+
result.toc = prevBookMetadata.toc
131+
}
132+
}
133+
134+
result.meta = metadata
110135
}
111-
result.meta = metadata
112136
} else if (
113137
url.hostname === 'read.amazon.com' &&
114138
url.searchParams.get('asin')?.toLowerCase() === asinL
@@ -118,12 +142,12 @@ async function main() {
118142
delete body.karamelToken
119143
delete body.metadataUrl
120144
delete body.YJFormatVersion
121-
if (!result.info) {
145+
if (!Object.keys(result.info).length) {
122146
console.warn('book info', body)
123147
}
124148
result.info = body
125149
} else if (url.pathname === '/renderer/render') {
126-
// TODO
150+
// TODO: these TAR files have some useful metadata that we could use...
127151
// const body = await response.body()
128152
// const tempDir = await extractTarToTemp(body)
129153
// const toc = JSON.parse(
@@ -224,7 +248,7 @@ async function main() {
224248

225249
async function updateSettings() {
226250
await page.locator('ion-button[aria-label="Reader settings"]').click()
227-
await delay(1000)
251+
await delay(500)
228252

229253
// Change font to Amazon Ember
230254
// My hypothesis is that this font will be easier for OCR to transcribe...
@@ -239,7 +263,7 @@ async function main() {
239263
.click()
240264

241265
await page.locator('ion-button[aria-label="Reader settings"]').click()
242-
await delay(1000)
266+
await delay(500)
243267
}
244268

245269
async function goToPage(pageNumber: number) {
@@ -257,7 +281,7 @@ async function main() {
257281
await page
258282
.locator('ion-modal ion-button[item-i-d="go-to-modal-go-button"]')
259283
.click()
260-
await delay(1000)
284+
await delay(500)
261285
}
262286

263287
async function getPageNav() {
@@ -283,21 +307,18 @@ async function main() {
283307
}
284308

285309
async function writeResultMetadata() {
286-
return fs.writeFile(
287-
path.join(outDir, 'metadata.json'),
288-
JSON.stringify(result, null, 2)
289-
)
310+
return fs.writeFile(metadataPath, JSON.stringify(result, null, 2))
290311
}
291312

292313
await dismissPossibleAlert()
293314
await ensureFixedHeaderUI()
294315
await updateSettings()
295316

296317
const initialPageNav = await getPageNav()
297-
let totalPages = 5
298-
let totalContentPages = 5
299318

300-
{
319+
if (!force && result.toc.length) {
320+
// Using a cached table of contents
321+
} else {
301322
// Extract the table of contents
302323
await page.locator('ion-button[aria-label="Table of Contents"]').click()
303324
await delay(2000)
@@ -306,7 +327,7 @@ async function main() {
306327
const $tocTopLevelItems = await page
307328
.locator('ion-list > div > ion-item')
308329
.all()
309-
const tocItems: Array<$TocItem> = []
330+
const tocItems: Array<TocItem> = []
310331

311332
console.warn(`initializing ${numTocItems} TOC items...`)
312333

@@ -333,10 +354,9 @@ async function main() {
333354
const pageNav = await getPageNav()
334355
assert(pageNav)
335356

336-
const currentTocItem: $TocItem = {
357+
const currentTocItem: TocItem = {
337358
label,
338-
...pageNav,
339-
locator: $tocItem
359+
...pageNav
340360
}
341361
tocItems.push(currentTocItem)
342362

@@ -377,29 +397,32 @@ async function main() {
377397
}
378398
}
379399

380-
const { locator: _, ...debugTocItem } = currentTocItem
381-
console.warn(debugTocItem)
400+
console.warn(currentTocItem)
382401
}
383402

384-
const parsedToc = parseTocItems(tocItems)
385-
console.log('parsed TOC', parsedToc)
386-
result.toc = tocItems.map(({ locator: _, ...tocItem }) => tocItem)
403+
result.toc = tocItems
387404

388-
totalPages = parsedToc.firstContentPageTocItem.total
389-
totalContentPages = Math.min(
390-
parsedToc.firstPostContentPageTocItem?.page ?? totalPages,
391-
totalPages
392-
)
393-
assert(totalContentPages > 0, 'No content pages found')
405+
await page.locator('.side-menu-close-button').click()
406+
await delay(500)
394407

395408
// Navigate to the first content page of the book
396-
await parsedToc.firstContentPageTocItem.locator!.click()
409+
// await parsedToc.firstContentPageTocItem.locator!.click()
397410
}
398411

399-
await page.locator('.side-menu-close-button').click()
400-
await delay(1000)
412+
const parsedToc = parseTocItems(result.toc)
401413

414+
const totalPages = parsedToc.firstContentPageTocItem.total
415+
const totalContentPages = Math.min(
416+
parsedToc.firstPostContentPageTocItem?.page ?? totalPages,
417+
totalPages
418+
)
419+
assert(totalContentPages > 0, 'No content pages found')
402420
const pageNumberPaddingAmount = `${totalContentPages * 2}`.length
421+
await writeResultMetadata()
422+
423+
// Navigate to the first content page of the book
424+
await goToPage(parsedToc.firstContentPageTocItem.page! ?? 1)
425+
403426
let maxPageSeen = -1
404427
let done = false
405428
console.warn(

src/playwright-utils.ts

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/* eslint-disable @typescript-eslint/no-non-null-asserted-optional-chain */
2-
import type { $TocItem, PageNav } from './types'
2+
import type { PageNav, TocItem } from './types'
33
import { assert, deromanize } from './utils'
44

55
export function parsePageNav(text: string | null): PageNav | undefined {
@@ -47,9 +47,9 @@ export function parsePageNav(text: string | null): PageNav | undefined {
4747
}
4848
}
4949

50-
export function parseTocItems(tocItems: $TocItem[]): {
51-
firstContentPageTocItem: $TocItem
52-
firstPostContentPageTocItem?: $TocItem
50+
export function parseTocItems(tocItems: TocItem[]): {
51+
firstContentPageTocItem: TocItem
52+
firstPostContentPageTocItem?: TocItem
5353
} {
5454
const flatTocItems = tocItems.flatMap((item) => [
5555
item,

src/types.ts

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import type { Locator } from 'patchright'
21
import type { Simplify, SimplifyDeep } from 'type-fest'
32

43
export interface BookMetadata {
@@ -36,8 +35,6 @@ export type TocItem = SimplifyDeep<
3635
}
3736
>
3837

39-
export type $TocItem = Simplify<TocItem & { locator?: Locator }>
40-
4138
/** Amazon's YT Metadata */
4239
export interface BookMeta {
4340
ACR: string

0 commit comments

Comments
 (0)