Commit f4501c8

Refactor Codex scraping workflow
Refactor the Codex scraping workflow: handle links that appear in more than one prompt file, replace the removed page.waitForTimeout call with a Promise-based delay, and clean up the scraper's temp files at the end of the run.
1 parent: 0c1b1e7

File tree

1 file changed (+26 / -64 lines)


.github/workflows/scrape_codex.yml

@@ -1,92 +1,54 @@
-name: Scrape Codex Links
-
-on:
-  workflow_dispatch:
-  push:
-    paths:
-      - "prompts/**/*.md"
-    branches:
-      - main
-
-jobs:
-  scrape:
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Checkout repo
-        uses: actions/checkout@v4
-
-      - name: Setup Node.js
-        uses: actions/setup-node@v4
-        with:
-          node-version: 20
-
-      - name: Install dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y xvfb jq
-          npm install [email protected]
-
-      - name: Find new Codex links
-        id: find
-        run: |
-          links=$(grep -Eroh 'https://chatgpt.com/s/[A-Za-z0-9_]+' prompts || true)
-          if [ -z "$links" ]; then
-            echo "No Codex links found."
-            exit 0
-          fi
-          echo "links<<EOF" >> $GITHUB_OUTPUT
-          echo "$links" >> $GITHUB_OUTPUT
-          echo "EOF" >> $GITHUB_OUTPUT
-
       - name: Scrape and append prompt text
         if: steps.find.outputs.links != ''
         run: |
+          # Inline Puppeteer scraper
           cat > scrape.js <<'EOF'
           import puppeteer from 'puppeteer';
           const [,, url] = process.argv;
           (async () => {
             const browser = await puppeteer.launch({
               headless: false,
-              args: [
-                '--no-sandbox',
-                '--disable-setuid-sandbox',
-                '--disable-gpu'
-              ]
+              args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-gpu']
             });
             const page = await browser.newPage();
             await page.goto(url, { waitUntil: 'networkidle2', timeout: 120000 });
-            await page.waitForTimeout(5000); // give JS time to render
+
+            // Delay for hydration (waitForTimeout removed in newer Puppeteer)
+            await new Promise(r => setTimeout(r, 5000));
+
             const text = await page.evaluate(() => {
               const container = document.querySelector('main') || document.body;
               return container.innerText.trim();
             });
+
             console.log(text.slice(0, 3000));
             await browser.close();
           })();
           EOF

           while read -r link; do
             echo "Processing $link..."
-            file=$(grep -rl "$link" prompts | head -n 1)
-            if grep -q "Extracted Prompt" "$file"; then
-              echo "Already scraped: $file"
+            files=$(grep -rl "$link" prompts || true)
+            if [ -z "$files" ]; then
+              echo "⚠️ No file found containing link: $link"
               continue
             fi

-            content=$(xvfb-run -a node scrape.js "$link" || true)
-            if [ -n "$content" ]; then
-              echo -e "\n\n---\n\n### Extracted Prompt\n$content" >> "$file"
-              echo "✅ Appended to $file"
-            else
-              echo "⚠️ No prompt extracted for $link"
-            fi
+            for file in $files; do
+              if grep -q "Extracted Prompt" "$file"; then
+                echo "Already scraped: $file"
+                continue
+              fi
+
+              content=$(xvfb-run -a node scrape.js "$link" || true)
+              if [ -n "$content" ]; then
+                echo -e "\n\n---\n\n### Extracted Prompt\n$content" >> "$file"
+                echo "✅ Appended to $file"
+              else
+                echo "⚠️ No prompt extracted for $link"
+              fi
+            done
           done <<< "$(echo "${{ steps.find.outputs.links }}")"

-      - name: Commit and push results
-        run: |
-          git config user.name "github-actions[bot]"
-          git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
-          git add prompts
-          git commit -m "Auto-scraped Codex prompts" || echo "No new content to commit."
-          git push
+          # Clean up temp files to avoid 'untracked files' noise
+          rm -rf node_modules package-lock.json package.json scrape.js
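
Side note on the Puppeteer change above: page.waitForTimeout() was removed in newer Puppeteer releases, so the scraper now sleeps via a plain Promise. A minimal standalone sketch of that drop-in pattern (the sleep helper name is illustrative, not part of the workflow):

// sleep(ms): Promise-based delay replacing the removed page.waitForTimeout(ms).
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

(async () => {
  // ...after page.goto(url, { waitUntil: 'networkidle2' })...
  await sleep(5000); // give client-side JS time to hydrate before reading innerText
})();

A fixed delay is kept because networkidle2 can resolve before client-side rendering has finished populating the page's main element.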

Comments (0)