-name: Scrape Codex Links
-
-on:
-  workflow_dispatch:
-  push:
-    paths:
-      - "prompts/**/*.md"
-    branches:
-      - main
-
-jobs:
-  scrape:
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Checkout repo
-        uses: actions/checkout@v4
-
-      - name: Setup Node.js
-        uses: actions/setup-node@v4
-        with:
-          node-version: 20
-
-      - name: Install dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y xvfb jq
-
-
-      - name: Find new Codex links
-        id: find
-        run: |
-          links=$(grep -Eroh 'https://chatgpt.com/s/[A-Za-z0-9_]+' prompts || true)
-          if [ -z "$links" ]; then
-            echo "No Codex links found."
-            exit 0
-          fi
-          echo "links<<EOF" >> $GITHUB_OUTPUT
-          echo "$links" >> $GITHUB_OUTPUT
-          echo "EOF" >> $GITHUB_OUTPUT
-
       - name: Scrape and append prompt text
         if: steps.find.outputs.links != ''
         run: |
+          # Inline Puppeteer scraper
           cat > scrape.js <<'EOF'
           import puppeteer from 'puppeteer';
           const [,, url] = process.argv;
           (async () => {
             const browser = await puppeteer.launch({
               headless: false,
-              args: [
-                '--no-sandbox',
-                '--disable-setuid-sandbox',
-                '--disable-gpu'
-              ]
+              args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-gpu']
             });
             const page = await browser.newPage();
             await page.goto(url, { waitUntil: 'networkidle2', timeout: 120000 });
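+            // networkidle2 above resolves once no more than two network
+            // connections have been open for at least 500 ms, so most
+            // client-side rendering has already happened by this point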
-            await page.waitForTimeout(5000); // give JS time to render
+
+            // Delay for hydration (waitForTimeout removed in newer Puppeteer)
+            await new Promise(r => setTimeout(r, 5000));
+
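+            // A selector wait would be sturdier than a fixed sleep, assuming
+            // the share page reliably renders a <main> element:
+            // await page.waitForSelector('main', { timeout: 120000 });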
             const text = await page.evaluate(() => {
               const container = document.querySelector('main') || document.body;
               return container.innerText.trim();
             });
+
             console.log(text.slice(0, 3000));
             await browser.close();
           })();
           EOF

           while read -r link; do
             echo "Processing $link..."
-            file=$(grep -rl "$link" prompts | head -n 1)
-            if grep -q "Extracted Prompt" "$file"; then
-              echo "Already scraped: $file"
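+            # A link can appear in more than one prompt file, so collect all matches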
+            files=$(grep -rl "$link" prompts || true)
+            if [ -z "$files" ]; then
+              echo "⚠️ No file found containing link: $link"
               continue
             fi

-            content=$(xvfb-run -a node scrape.js "$link" || true)
-            if [ -n "$content" ]; then
-              echo -e "\n\n---\n\n### Extracted Prompt\n$content" >> "$file"
-              echo "✅ Appended to $file"
-            else
-              echo "⚠️ No prompt extracted for $link"
-            fi
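+            # For each matching file: skip if already scraped, else scrape under xvfb and append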
+            for file in $files; do
+              if grep -q "Extracted Prompt" "$file"; then
+                echo "Already scraped: $file"
+                continue
+              fi
+
+              content=$(xvfb-run -a node scrape.js "$link" || true)
+              if [ -n "$content" ]; then
+                echo -e "\n\n---\n\n### Extracted Prompt\n$content" >> "$file"
+                echo "✅ Appended to $file"
+              else
+                echo "⚠️ No prompt extracted for $link"
+              fi
+            done
           done <<< "$(echo "${{ steps.find.outputs.links }}")"

-      - name: Commit and push results
-        run: |
-          git config user.name "github-actions[bot]"
-          git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
-          git add prompts
-          git commit -m "Auto-scraped Codex prompts" || echo "No new content to commit."
-          git push
+          # Clean up temp files to avoid 'untracked files' noise
+          rm -rf node_modules package-lock.json package.json scrape.js