Skip to content

Commit 3aa4583

Browse files
Update GitHub Actions workflow for scraping Codex links
1 parent 78daecd commit 3aa4583

File tree

1 file changed

+50
-7
lines changed

1 file changed

+50
-7
lines changed

.github/workflows/scrape_codex.yml

Lines changed: 50 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,47 @@
1+
# Workflow identity and triggers.
# Runs on demand from the Actions tab, and automatically whenever a Markdown
# file under prompts/ changes on the main branch.
name: Scrape Codex Links

on:
  push:
    branches: [main]
    paths:
      - 'prompts/**/*.md'
  workflow_dispatch:
10+
11+
jobs:
  # Single job: collect Codex share links from prompts/, scrape each page,
  # and commit the scraped text back to the repository.
  scrape:
    runs-on: ubuntu-latest

    # The final step pushes a commit with the default GITHUB_TOKEN. Request
    # write access explicitly so the push also works in repositories and
    # organisations whose default token permission is read-only.
    permissions:
      contents: write

    steps:
16+
# Fetch the repository so the prompts/ tree is available to later steps.
- name: Checkout repo
  uses: actions/checkout@v4

# Provide the Node.js runtime used by the scraper script.
- name: Setup Node.js
  uses: actions/setup-node@v4
  with:
    # Quoted so the version is always read as a string, never as a number.
    node-version: "20"
23+
24+
# Install the system packages (xvfb, jq) and the npm package needed by the
# scraping step.
- name: Install dependencies
  run: |
    sudo apt-get update
    sudo apt-get install -y xvfb jq
    # NOTE(review): the package spec on this line was mangled to
    # "[email protected]" (e-mail obfuscation of "<package>@<version>"), which
    # npm cannot install. The package name is taken from the scraper's
    # `import puppeteer from 'puppeteer'`; the original version pin could not
    # be recovered -- restore it from the repository history.
    npm install puppeteer
29+
30+
# Collect every Codex share link mentioned under prompts/ and expose the list
# as the multi-line step output `links` (one URL per line). When nothing is
# found the output is left unset, which the scrape step's `if:` relies on.
- name: Find new Codex links
  id: find
  run: |
    # -r recurse, -o print only the matched text, -h omit file names.
    # The dots are escaped so they match a literal "." rather than any
    # character, and `sort -u` drops duplicates so a link referenced from
    # several files is not reported more than once.
    # `|| true` keeps the step alive under `bash -e` when nothing matches.
    links=$(grep -Eroh 'https://chatgpt\.com/s/[A-Za-z0-9_]+' prompts | sort -u || true)
    if [ -z "$links" ]; then
      echo "No Codex links found."
      exit 0
    fi
    # Multi-line outputs use the heredoc-style `name<<DELIMITER` syntax.
    {
      echo "links<<EOF"
      echo "$links"
      echo "EOF"
    } >> "$GITHUB_OUTPUT"
41+
142
- name: Scrape and append prompt text
243
if: steps.find.outputs.links != ''
344
run: |
4-
# Inline Puppeteer scraper
545
cat > scrape.js <<'EOF'
646
import puppeteer from 'puppeteer';
747
const [,, url] = process.argv;
@@ -12,15 +52,11 @@
1252
});
1353
const page = await browser.newPage();
1454
await page.goto(url, { waitUntil: 'networkidle2', timeout: 120000 });
15-
16-
// Delay for hydration (waitForTimeout removed in newer Puppeteer)
17-
await new Promise(r => setTimeout(r, 5000));
18-
55+
await new Promise(r => setTimeout(r, 5000)); // delay for hydration
1956
const text = await page.evaluate(() => {
2057
const container = document.querySelector('main') || document.body;
2158
return container.innerText.trim();
2259
});
23-
2460
console.log(text.slice(0, 3000));
2561
await browser.close();
2662
})();
@@ -50,5 +86,12 @@
5086
done
5187
done <<< "$(echo "${{ steps.find.outputs.links }}")"
5288
53-
# Clean up temp files to avoid 'untracked files' noise
5489
rm -rf node_modules package-lock.json package.json scrape.js
90+
91+
# Commit whatever changed under prompts/ as the GitHub Actions bot and push
# it back to the checked-out branch.
- name: Commit and push results
  run: |
    git config user.name "github-actions[bot]"
    git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
    git add prompts
    # Stop early when nothing is staged. Checking the index explicitly,
    # instead of `git commit ... || echo`, means a genuine commit failure
    # now fails the step instead of being reported as "nothing to commit".
    if git diff --cached --quiet; then
      echo "No new content to commit."
      exit 0
    fi
    git commit -m "Auto-scraped Codex prompts"
    git push

0 commit comments

Comments
 (0)