Commit f4501c8

Refactor Codex scraping workflow
Refactor the Codex scraping workflow: handle links that appear in more than one prompt file, replace the removed page.waitForTimeout call with a Promise-based delay, and clean up the scraper's temp files at the end of the run.
1 parent: 0c1b1e7

File tree

1 file changed (+26 / -64 lines)


.github/workflows/scrape_codex.yml

@@ -1,92 +1,54 @@
-name: Scrape Codex Links
-
-on:
-  workflow_dispatch:
-  push:
-    paths:
-      - "prompts/**/*.md"
-    branches:
-      - main
-
-jobs:
-  scrape:
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Checkout repo
-        uses: actions/checkout@v4
-
-      - name: Setup Node.js
-        uses: actions/setup-node@v4
-        with:
-          node-version: 20
-
-      - name: Install dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y xvfb jq
-          npm install [email protected]
-
-      - name: Find new Codex links
-        id: find
-        run: |
-          links=$(grep -Eroh 'https://chatgpt.com/s/[A-Za-z0-9_]+' prompts || true)
-          if [ -z "$links" ]; then
-            echo "No Codex links found."
-            exit 0
-          fi
-          echo "links<<EOF" >> $GITHUB_OUTPUT
-          echo "$links" >> $GITHUB_OUTPUT
-          echo "EOF" >> $GITHUB_OUTPUT
-
       - name: Scrape and append prompt text
         if: steps.find.outputs.links != ''
         run: |
+          # Inline Puppeteer scraper
           cat > scrape.js <<'EOF'
           import puppeteer from 'puppeteer';
           const [,, url] = process.argv;
           (async () => {
             const browser = await puppeteer.launch({
               headless: false,
-              args: [
-                '--no-sandbox',
-                '--disable-setuid-sandbox',
-                '--disable-gpu'
-              ]
+              args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-gpu']
             });
             const page = await browser.newPage();
             await page.goto(url, { waitUntil: 'networkidle2', timeout: 120000 });
-            await page.waitForTimeout(5000); // give JS time to render
+
+            // Delay for hydration (waitForTimeout removed in newer Puppeteer)
+            await new Promise(r => setTimeout(r, 5000));
+
             const text = await page.evaluate(() => {
               const container = document.querySelector('main') || document.body;
               return container.innerText.trim();
             });
+
             console.log(text.slice(0, 3000));
             await browser.close();
           })();
           EOF

           while read -r link; do
             echo "Processing $link..."
-            file=$(grep -rl "$link" prompts | head -n 1)
-            if grep -q "Extracted Prompt" "$file"; then
-              echo "Already scraped: $file"
+            files=$(grep -rl "$link" prompts || true)
+            if [ -z "$files" ]; then
+              echo "⚠️ No file found containing link: $link"
               continue
             fi

-            content=$(xvfb-run -a node scrape.js "$link" || true)
-            if [ -n "$content" ]; then
-              echo -e "\n\n---\n\n### Extracted Prompt\n$content" >> "$file"
-              echo "✅ Appended to $file"
-            else
-              echo "⚠️ No prompt extracted for $link"
-            fi
+            for file in $files; do
+              if grep -q "Extracted Prompt" "$file"; then
+                echo "Already scraped: $file"
+                continue
+              fi
+
+              content=$(xvfb-run -a node scrape.js "$link" || true)
+              if [ -n "$content" ]; then
+                echo -e "\n\n---\n\n### Extracted Prompt\n$content" >> "$file"
+                echo "✅ Appended to $file"
+              else
+                echo "⚠️ No prompt extracted for $link"
+              fi
+            done
           done <<< "$(echo "${{ steps.find.outputs.links }}")"

-      - name: Commit and push results
-        run: |
-          git config user.name "github-actions[bot]"
-          git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
-          git add prompts
-          git commit -m "Auto-scraped Codex prompts" || echo "No new content to commit."
-          git push
+          # Clean up temp files to avoid 'untracked files' noise
+          rm -rf node_modules package-lock.json package.json scrape.js
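
Side note on the Puppeteer change above: page.waitForTimeout() was removed in newer Puppeteer releases, so the scraper now sleeps via a plain Promise. A minimal standalone sketch of that drop-in pattern (the sleep helper name is illustrative, not part of the workflow):

// sleep(ms): Promise-based delay replacing the removed page.waitForTimeout(ms).
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

(async () => {
  // ...after page.goto(url, { waitUntil: 'networkidle2' })...
  await sleep(5000); // give client-side JS time to hydrate before reading innerText
})();

A fixed delay is kept because networkidle2 can resolve before client-side rendering has finished populating the page's main element.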

Comments (0)