🦅

transitive-bullshit · transitive-bullshit · commit 5bc2e0c4e0e7 · 2025-10-22T15:08:31.000+07:00
diff --git a/src/extract-kindle-book.ts b/src/extract-kindle-book.ts
@@ -12,7 +12,14 @@ import sharp from 'sharp'
 
 import type { BookMetadata, TocItem } from './types'
 import { parsePageNav, parseTocItems } from './playwright-utils'
-import { assert, getEnv, normalizeAuthors, parseJsonpResponse } from './utils'
+import {
+  assert,
+  // extractTar,
+  getEnv,
+  // hashObject,
+  normalizeAuthors,
+  parseJsonpResponse
+} from './utils'
 
 // Block amazon analytics requests
 // (not strictly necessary, but adblockers do this by default anyway and it
@@ -148,8 +155,20 @@ async function main() {
           result.info = body
         } else if (url.pathname === '/renderer/render') {
           // TODO: these TAR files have some useful metadata that we could use...
+          // const params = Object.fromEntries(url.searchParams.entries())
+          // const hash = hashObject(params)
+          // const renderDir = path.join(userDataDir, 'render', hash)
+          // await fs.mkdir(renderDir, { recursive: true })
           // const body = await response.body()
-          // const tempDir = await extractTarToTemp(body)
+          // const tempDir = await extractTar(body, { cwd: renderDir })
+          // const { startingPosition, skipPageCount, numPage } = params
+          // console.log('RENDER TAR', tempDir, {
+          //   startingPosition,
+          //   skipPageCount,
+          //   numPage
+          // })
+          // TODO: if `location_map.json` exists, record `navigationUnit` map of positions to pages
+          // TODO: `page_data_0_5.json` has start/end/words for each page in this render batch
           // const toc = JSON.parse(
           //   await fs.readFile(path.join(tempDir, 'toc.json'), 'utf8')
           // )
diff --git a/src/utils.ts b/src/utils.ts
@@ -137,25 +137,27 @@ export function ffmpegOnProgress(
  * Decompress a TAR (optionally .tar.gz/.tgz) Buffer to a fresh temp directory.
  * Returns the absolute path of the temp directory.
  */
-export async function extractTarToTemp(
+export async function extractTar(
   buf: Buffer,
-  opts: { strip?: number } = {}
+  {
+    strip = 0,
+    cwd = temporaryDirectory()
+  }: { strip?: number; cwd?: string } = {}
 ): Promise<string> {
-  const dir = temporaryDirectory()
   const isGzip = buf.length >= 2 && buf[0] === 0x1f && buf[1] === 0x8b
 
   try {
     const extractor = extract({
-      cwd: dir,
+      cwd,
       gzip: isGzip,
-      strip: opts.strip ?? 0 // remove leading path segments if desired
+      strip
     })
 
     await pipeline(Readable.from(buf), extractor)
-    return dir
+    return cwd
   } catch (err) {
     // Clean up the temp dir if extraction fails
-    await fs.rm(dir, { recursive: true, force: true }).catch(() => {})
+    await fs.rm(cwd, { recursive: true, force: true }).catch(() => {})
     throw err
   }
 }
diff --git a/todo.md b/todo.md
@@ -1,4 +1,4 @@
-- extraction
+- `extract-kindle-book`
   - extract raw images (product image, etc)
   - special-case handling of pages with only an image child
   - handle rich-text / markdown

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -1,4 +1,4 @@` |
| `1` | | `-- extraction` |
| | `1` | `` +- `extract-kindle-book` `` |
| `2` | `2` | `- extract raw images (product image, etc)` |
| `3` | `3` | `- special-case handling of pages with only an image child` |
| `4` | `4` | `- handle rich-text / markdown` |