forked from webrecorder/browsertrix-crawler
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdefaultDriver.js
89 lines (68 loc) · 2.05 KB
/
defaultDriver.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
// Node's file-system module; used below to load the autoplay script text.
const fs = require("fs");
// Source text of the autoplay script, read synchronously once when this module
// is loaded, from a fixed absolute path. The exported driver passes this
// string to page.evaluateOnNewDocument().
const autoplayScript = fs.readFileSync("/app/autoplay.js", "utf-8");
// Earlier alternative that loaded the script as a module instead of as text:
//const autoplayScript = require("/app/autoplay.js");
// NOTE(review): presumably disabled because autoScroll() below references
// `self`, which is not declared in the visible code - confirm.
/* eslint-disable no-undef */
module.exports = async ({data, page, crawler}) => {
const {url} = data;
//page.on("requestfailed", message => console.warn(message._failureText));
if (!await crawler.isHTML(url)) {
await crawler.directFetchCapture(url);
return;
}
const mediaResults = [];
await page.exposeFunction("__crawler_queueUrls", async (url) => {
mediaResults.push(await crawler.directFetchCapture(url));
});
let waitForVideo = false;
await page.exposeFunction("__crawler_autoplayLoad", (url) => {
console.log("*** Loading autoplay URL: " + url);
waitForVideo = true;
});
try {
await page.evaluateOnNewDocument(autoplayScript);
} catch(e) {
console.log(e);
}
const gotoOpts = {
waitUntil: crawler.params.waitUntil,
timeout: crawler.params.timeout
};
try {
await page.goto(url, gotoOpts);
} catch (e) {
console.log(`Load timeout for ${url}`, e);
}
try {
await Promise.all(mediaResults);
} catch (e) {
console.log("Error loading media URLs", e);
}
if (waitForVideo) {
console.log("Extra wait 15s for video loading");
await crawler.sleep(15000);
}
if (crawler.params.scroll) {
try {
await Promise.race([page.evaluate(autoScroll), crawler.sleep(30000)]);
} catch (e) {
console.warn("Behavior Failed", e);
}
}
await crawler.extractLinks(page, "a[href]");
};
/**
 * Scrolls the window downwards in 250px steps, pausing 500ms after each step,
 * until the bottom of the document is reached or scrolling stops making
 * progress.
 *
 * The driver passes this function to `page.evaluate`, so it must stay
 * self-contained: it may only use globals available in the page (`self`,
 * `setTimeout`) and must not reference anything else from this module.
 *
 * @returns {Promise<void>} Resolves once scrolling has finished.
 */
async function autoScroll() {
  // Tallest of the usual document-height measurements. `body` is read
  // defensively because a document is not guaranteed to have one.
  const documentHeight = () => {
    const { body, documentElement: root } = self.document;
    return Math.max(
      body ? body.scrollHeight : 0,
      body ? body.offsetHeight : 0,
      root.clientHeight,
      root.scrollHeight,
      root.offsetHeight
    );
  };

  const canScrollMore = () => self.scrollY + self.innerHeight < documentHeight();

  const scrollOpts = { top: 250, left: 0, behavior: "auto" };

  while (canScrollMore()) {
    const previousY = self.scrollY;

    self.scrollBy(scrollOpts);
    await new Promise(resolve => setTimeout(resolve, 500));

    // If the viewport did not move, further attempts cannot succeed (scrolling
    // is blocked, or a fractional scrollY never reaches the rounded document
    // height). Stop instead of looping in the page forever.
    if (self.scrollY <= previousY) {
      break;
    }
  }
}