Skip to content

Commit ea1b134

Browse files
feat: improved toc; fixes
1 parent d92c501 commit ea1b134

File tree

7 files changed

+115
-38
lines changed

7 files changed

+115
-38
lines changed

package.json

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"name": "kindle-ai-export",
33
"private": true,
4-
"version": "0.1.0",
4+
"version": "0.2.0",
55
"description": "Export any Kindle book you own as text, PDF, EPUB, or as a custom, AI-narrated audiobook.",
66
"author": "Travis Fischer <[email protected]>",
77
"license": "MIT",
@@ -25,10 +25,8 @@
2525
"@inquirer/prompts": "^7.0.0",
2626
"dotenv": "^17.2.3",
2727
"fluent-ffmpeg": "^2.1.3",
28-
"globby": "^15.0.0",
2928
"hash-object": "^5.0.1",
3029
"hh-mm-ss": "^1.2.0",
31-
"kindle-api-ky": "^1.0.1",
3230
"ky": "^1.12.0",
3331
"node-id3": "^0.2.6",
3432
"openai-fetch": "^3.4.2",

src/export-book-markdown.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ async function main() {
4141

4242
let output = `# ${title}
4343
44-
By ${authors.join(', ')}
44+
> By ${authors.join(', ')}
4545
4646
---
4747
@@ -53,7 +53,7 @@ ${metadata.toc
5353
)
5454
.map(
5555
(tocItem) =>
56-
`- [${tocItem.label}](#${tocItem.label.toLowerCase().replaceAll(/[^\da-z]+/g, '-')})`
56+
`${' '.repeat(tocItem.depth)}- [${tocItem.label}](#${tocItem.label.toLowerCase().replaceAll(/[^\da-z]+/g, '-')})`
5757
)
5858
.join('\n')}
5959
@@ -78,7 +78,7 @@ ${metadata.toc
7878

7979
output += `
8080
81-
## ${tocItem.label}
81+
${'#'.repeat(tocItem.depth + 2)} ${tocItem.label}
8282
8383
${text}`
8484

src/export-book-pdf.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ async function main() {
8484
const text = chunks.map((chunk) => chunk.text).join(' ')
8585

8686
;(doc as any).outline.addItem(tocItem.label)
87-
doc.fontSize(20)
87+
doc.fontSize(tocItem.depth === 1 ? 16 : 20)
8888
doc.text(tocItem.label, { align: 'center', lineGap: 16 })
8989

9090
doc.fontSize(fontSize)

src/extract-kindle-book.ts

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,7 @@ async function main() {
325325

326326
const numTocItems = await page.locator('ion-list ion-item').count()
327327
const $tocTopLevelItems = await page
328+
// TODO: this is pretty brittle
328329
.locator('ion-list > div > ion-item')
329330
.all()
330331
const tocItems: Array<TocItem> = []
@@ -356,9 +357,11 @@ async function main() {
356357

357358
const currentTocItem: TocItem = {
358359
label,
360+
depth: 0,
359361
...pageNav
360362
}
361363
tocItems.push(currentTocItem)
364+
console.warn(currentTocItem)
362365

363366
// if (pageNav.page !== undefined) {
364367
// // TODO: this assumes the toc items are in order and contiguous...
@@ -372,7 +375,6 @@ async function main() {
372375
.all()
373376

374377
if (subTocItems.length > 0) {
375-
currentTocItem.entries = []
376378
console.warn(`${label}: found ${subTocItems.length} sub-TOC items...`)
377379

378380
for (const $subTocItem of subTocItems) {
@@ -385,8 +387,9 @@ async function main() {
385387
const pageNav = await getPageNav()
386388
assert(pageNav)
387389

388-
currentTocItem.entries!.push({
390+
tocItems.push({
389391
label,
392+
depth: 1,
390393
...pageNav
391394
})
392395

@@ -396,8 +399,6 @@ async function main() {
396399
})
397400
}
398401
}
399-
400-
console.warn(currentTocItem)
401402
}
402403

403404
result.toc = tocItems

src/transcribe-book-content.ts

Lines changed: 48 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -3,41 +3,58 @@ import 'dotenv/config'
33
import fs from 'node:fs/promises'
44
import path from 'node:path'
55

6-
import { globby } from 'globby'
76
import { OpenAIClient } from 'openai-fetch'
87
import pMap from 'p-map'
98

10-
import type { ContentChunk } from './types'
9+
import type { BookMetadata, ContentChunk, TocItem } from './types'
1110
import { assert, getEnv } from './utils'
1211

1312
async function main() {
1413
const asin = getEnv('ASIN')
1514
assert(asin, 'ASIN is required')
1615

1716
const outDir = path.join('out', asin)
18-
const pageScreenshotsDir = path.join(outDir, 'pages')
19-
const pageScreenshots = await globby(`${pageScreenshotsDir}/*.png`)
20-
assert(pageScreenshots.length, 'no page screenshots found')
17+
const metadata = JSON.parse(
18+
await fs.readFile(path.join(outDir, 'metadata.json'), 'utf8')
19+
) as BookMetadata
20+
assert(metadata.pages?.length, 'no page screenshots found')
21+
assert(metadata.toc?.length, 'invalid book metadata: missing toc')
22+
23+
// eslint-disable-next-line unicorn/no-array-reduce
24+
const pageToTocItemMap = metadata.toc.reduce(
25+
(acc, tocItem) => {
26+
if (tocItem.page !== undefined) {
27+
acc[tocItem.page] = tocItem
28+
}
29+
return acc
30+
},
31+
{} as Record<number, TocItem>
32+
)
33+
34+
// const pageScreenshotsDir = path.join(outDir, 'pages')
35+
// const pageScreenshots = await globby(`${pageScreenshotsDir}/*.png`)
36+
// assert(pageScreenshots.length, 'no page screenshots found')
2137

2238
const openai = new OpenAIClient()
2339

2440
const content: ContentChunk[] = (
2541
await pMap(
26-
pageScreenshots,
27-
async (screenshot) => {
42+
metadata.pages,
43+
async (pageChunk, pageChunkIndex) => {
44+
const { screenshot, index, page } = pageChunk
2845
const screenshotBuffer = await fs.readFile(screenshot)
2946
const screenshotBase64 = `data:image/png;base64,${screenshotBuffer.toString('base64')}`
30-
const metadataMatch = screenshot.match(/0*(\d+)-\0*(\d+).png/)
31-
assert(
32-
metadataMatch?.[1] && metadataMatch?.[2],
33-
`invalid screenshot filename: ${screenshot}`
34-
)
35-
const index = Number.parseInt(metadataMatch[1]!, 10)
36-
const page = Number.parseInt(metadataMatch[2]!, 10)
37-
assert(
38-
!Number.isNaN(index) && !Number.isNaN(page),
39-
`invalid screenshot filename: ${screenshot}`
40-
)
47+
// const metadataMatch = screenshot.match(/0*(\d+)-\0*(\d+).png/)
48+
// assert(
49+
// metadataMatch?.[1] && metadataMatch?.[2],
50+
// `invalid screenshot filename: ${screenshot}`
51+
// )
52+
// const index = Number.parseInt(metadataMatch[1]!, 10)
53+
// const page = Number.parseInt(metadataMatch[2]!, 10)
54+
// assert(
55+
// !Number.isNaN(index) && !Number.isNaN(page),
56+
// `invalid screenshot filename: ${screenshot}`
57+
// )
4158

4259
try {
4360
const maxRetries = 20
@@ -69,7 +86,7 @@ Do not include any additional text, descriptions, or punctuation. Ignore any emb
6986
})
7087

7188
const rawText = res.choices[0]!.message.content!
72-
const text = rawText
89+
let text = rawText
7390
.replace(/^\s*\d+\s*$\n+/m, '')
7491
// .replaceAll(/\n+/g, '\n')
7592
.replaceAll(/^\s*/gm, '')
@@ -95,6 +112,18 @@ Do not include any additional text, descriptions, or punctuation. Ignore any emb
95112
continue
96113
}
97114

115+
const prevPageChunk = metadata.pages[pageChunkIndex - 1]
116+
if (prevPageChunk && prevPageChunk.page !== page) {
117+
const tocItem = pageToTocItemMap[page]
118+
if (tocItem) {
119+
text = text.replace(
120+
// eslint-disable-next-line security/detect-non-literal-regexp
121+
new RegExp(`^${tocItem.label}\\s*`, 'i'),
122+
''
123+
)
124+
}
125+
}
126+
98127
const result: ContentChunk = {
99128
index,
100129
page,

src/types.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,8 @@ export interface PageNav {
3030
export type TocItem = SimplifyDeep<
3131
PageNav & {
3232
label: string
33-
tocPositionId?: number
34-
entries?: Simplify<Omit<TocItem, 'entries'>>[]
33+
// tocPositionId?: number
34+
depth: number
3535
}
3636
>
3737

src/utils.ts

Lines changed: 55 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,62 @@ import timeFormat from 'hh-mm-ss'
77
import { extract } from 'tar'
88
import { temporaryDirectory } from 'tempy'
99

10-
export {
11-
assert,
12-
getEnv,
13-
normalizeAuthors,
14-
parseJsonpResponse
15-
} from 'kindle-api-ky'
10+
/**
 * Throws unless `value` is truthy, narrowing its type for the caller.
 *
 * @param value - Condition to check; any falsy value fails the assertion.
 * @param message - Optional failure detail. A non-empty string is wrapped in
 * a new `Error`; an `Error` instance is thrown as-is.
 * @throws The supplied `Error`, or a new `Error` carrying `message` (or
 * `'Assertion failed'` when no message was given).
 */
export function assert(
  value: unknown,
  message?: string | Error
): asserts value {
  if (!value) {
    // A pre-built error object is thrown untouched so the caller's own
    // error is what propagates.
    if (message && typeof message !== 'string') {
      throw message
    }

    throw new Error(message || 'Assertion failed')
  }
}
24+
25+
/**
 * Looks up an environment variable without assuming a global `process`.
 *
 * @param name - Name of the environment variable to look up.
 * @returns The variable's value, or `undefined` when there is no global
 * `process`, the variable is unset, or reading it throws.
 */
export function getEnv(name: string): string | undefined {
  // No global `process` means there is no environment to read from.
  if (typeof process === 'undefined') {
    return undefined
  }

  try {
    // eslint-disable-next-line no-process-env
    return process.env?.[name]
  } catch {
    // Best effort: any failure while reading `process.env` counts as unset.
    return undefined
  }
}
35+
36+
/**
 * Converts a raw author field into a list of display names.
 *
 * Only the first entry of `rawAuthors` is read. It is split on `:` into one
 * segment per author; each segment is split on `,`, its parts are trimmed and
 * reversed, then joined with a single space (`"Doe, John"` becomes
 * `"John Doe"`).
 *
 * Blank parts and blank segments are dropped, so stray separators or
 * whitespace never produce empty names or leading/trailing spaces. Duplicates
 * are removed after normalization, so `"Doe, John"` and `"Doe,John"` collapse
 * into one entry. First-seen order is preserved.
 *
 * @param rawAuthors - Raw author strings; entries after the first are ignored.
 * @returns The normalized author names, or an empty array when there is no
 * usable first entry.
 */
export function normalizeAuthors(rawAuthors: string[]): string[] {
  const rawAuthor = rawAuthors?.[0]
  if (!rawAuthor) {
    return []
  }

  const names = rawAuthor
    .split(':')
    .map((segment) =>
      segment
        .split(',')
        .map((part) => part.trim())
        // Drop empty parts so "Doe, John," does not yield " John Doe".
        .filter(Boolean)
        // The mapped/filtered array is a fresh copy, so in-place reverse is safe.
        .reverse()
        .join(' ')
    )
    // A whitespace-only segment normalizes to '' and is not an author.
    .filter(Boolean)

  // De-duplicate on the normalized form while keeping first-seen order.
  return [...new Set(names)]
}
51+
52+
// Matches the `{...}` object payload inside the parentheses of a JSONP call
// when the payload sits on a single line.
const JSONP_REGEX = /\(({.*})\)/

// Same pattern with the dotAll flag so the payload may span several lines
// (e.g. pretty-printed JSON). Used only as a fallback.
const JSONP_MULTILINE_REGEX = /\(({.*})\)/s

/**
 * Extracts and parses the JSON object wrapped by a JSONP response body such
 * as `callback({"key": "value"})`.
 *
 * The single-line pattern is tried first; if it does not yield parseable
 * JSON, a multi-line-aware pattern is tried so that payloads containing line
 * breaks are supported as well.
 *
 * @param body - Raw JSONP response text.
 * @returns The parsed payload cast to `T` (not validated), or `undefined`
 * when `body` is not a string, contains no `({...})` payload, or the payload
 * is not valid JSON.
 */
export function parseJsonpResponse<T = unknown>(body: string): T | undefined {
  if (typeof body !== 'string') {
    return undefined
  }

  for (const pattern of [JSONP_REGEX, JSONP_MULTILINE_REGEX]) {
    const content = pattern.exec(body)?.[1]
    if (!content) {
      continue
    }

    try {
      return JSON.parse(content) as T
    } catch {
      // Not valid JSON for this pattern; fall through to the next one.
    }
  }

  return undefined
}
1766
const numerals = { I: 1, V: 5, X: 10, L: 50, C: 100, D: 500, M: 1000 }
1867

1968
export function deromanize(romanNumeral: string): number {

0 commit comments

Comments
 (0)