Skip to content

Commit 0e970df

Browse files
author
Drew McMillan
committed
more
1 parent 669ccc9 commit 0e970df

File tree

11 files changed

+2483
-229
lines changed

11 files changed

+2483
-229
lines changed

example.json

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
{
2-
"url": "https://www.example.com",
3-
"maxDepth": 1,
4-
"maxChromeInstances": 5,
5-
"limit": "/music/",
2+
"url": "https://www.bbc.co.uk/news",
3+
"maxDepth": 2,
4+
"maxChromeInstances": 10,
5+
"limit": "/news/",
66
"httpsOnly": true,
77
"showHttpLinksDuring": false,
88
"showHttpLinksAfter": true,
9-
"userAgent": "light-mc-crawler Mixed Content Crawler"
9+
"chromeFlags": ["--show-paint-rects", "--no-sandbox", "--user-data-dir", "--headless", "--disable-setuid-sandbox", "--disable-gpu", "--disable-dev-shm-usage"],
10+
"userAgent": "light-mc-crawler - News Discovery - Mixed Content Crawler"
1011
}

index.js

Lines changed: 33 additions & 224 deletions
Original file line numberDiff line numberDiff line change
@@ -1,244 +1,53 @@
1-
const cheerio = require('cheerio')
2-
const ChildProcess = require('child_process')
3-
const Crawler = require('simplecrawler')
4-
const path = require('path')
5-
const queue = require('async/queue')
6-
const fs = require('fs')
7-
const colors = require('colors')
8-
const util = require('util')
9-
10-
const stats = {
1+
const cheerio = require('cheerio'),
2+
queue = require('./src/queue'),
3+
path = require('path'),
4+
fs = require('fs'),
5+
Crawler = require('./src/crawler'),
6+
printStats = require('./src/log/final');
7+
8+
let config, stats = {
119
pageCount: 0,
12-
violationCounts: {},
13-
foundHttpLinks: {},
14-
passedAuditsCount: 0,
15-
startTime: null,
16-
auditTimesByPageUrl: {}
10+
totalErrorCount: 0,
11+
startTime: new Date()
12+
};
13+
14+
function discoverResources(buffer, item) {
15+
const page = cheerio.load(buffer.toString('utf8'))
16+
var links = page('a[href]').map(function () {
17+
return page(this).attr('href')
18+
}).get()
19+
20+
if(config.limit){
21+
links = links.filter(function(s){
22+
return ~s.indexOf(config.limit);
23+
});
24+
}
25+
26+
return links
1727
}
1828

1929
module.exports = (options) => {
2030
console.log("ô¿ô light-mc-crawler has started crawling. If it looks like nothing is happening, wait, it is :)");
2131

22-
stats.startTime = new Date()
32+
config = JSON.parse(fs.readFileSync(path.resolve(options.config)))
2333

24-
const configPath = path.resolve(options.config)
25-
const config = JSON.parse(fs.readFileSync(configPath))
34+
const lighthouseQueue = queue(config, stats);
35+
const crawler = Crawler(config);
2636

27-
const crawler = new Crawler(options.url || config.url)
28-
crawler.respectRobotsTxt = false
29-
crawler.parseHTMLComments = false
30-
crawler.parseScriptTags = false
31-
crawler.userAgent = options.userAgent || "light-mc-crawler Mixed Content Crawler"
32-
crawler.maxDepth = config.maxDepth || 1
33-
34-
35-
crawler.discoverResources = (buffer, item) => {
36-
const page = cheerio.load(buffer.toString('utf8'))
37-
var links = page('a[href]').map(function () {
38-
return page(this).attr('href')
39-
}).get()
40-
41-
if(config.limit){
42-
links = links.filter(function(s){
43-
return ~s.indexOf(config.limit);
44-
});
45-
}
46-
47-
if(config.showHttpLinksDuring || config.showHttpLinksAfter){
48-
links.forEach(function(link) {
49-
if(link.indexOf('http://') !== -1){
50-
if(!stats.foundHttpLinks[item.url]){
51-
stats.foundHttpLinks[item.url] = [];
52-
}
53-
54-
stats.foundHttpLinks[item.url].push(link)
55-
}
56-
});
57-
58-
if(config.showHttpLinksDuring && stats.foundHttpLinks[item.url]){
59-
console.log();
60-
console.log('Http link(s) on '.bold.underline + item.url.bold.underline);
61-
stats.foundHttpLinks[item.url].forEach(function(link) {
62-
console.log(' ' + link);
63-
});
64-
}
65-
}
66-
67-
return links
68-
}
69-
70-
let totalErrorCount = 0
71-
72-
const lighthouseQueue = queue((url, callback) => {
73-
runLighthouse(url, config, (errorCount) => {
74-
totalErrorCount += errorCount
75-
callback()
76-
})
77-
}, config.maxChromeInstances || 5)
37+
crawler.discoverResources = discoverResources;
7838

7939
crawler.on('fetchcomplete', (queueItem, responseBuffer, response) => {
8040
lighthouseQueue.push(queueItem.url)
81-
})
41+
});
8242

8343
crawler.once('complete', () => {
8444
lighthouseQueue.drain = () => {
85-
printStats(config)
86-
if (totalErrorCount > 0) {
45+
printStats(stats)
46+
if (stats.totalErrorCount > 0) {
8747
process.exit(1)
8848
}
8949
}
9050
})
9151

9252
crawler.start()
93-
}
94-
95-
function runLighthouse (url, config, callback) {
96-
if(config.httpsOnly){
97-
url = url.replace("http://", "https://");
98-
}
99-
100-
stats.pageCount++
101-
var mixedContent = require.resolve('lighthouse/lighthouse-core/config/mixed-content.js')
102-
var chromeFlags = config.chromeFlags || '--headless --disable-gpu';
103-
var userAgent = config.userAgent || 'light-mc-crawler Mixed Content Crawler'
104-
const args = [
105-
url,
106-
'--output=json',
107-
'--output-path=stdout',
108-
'--disable-device-emulation',
109-
'--disable-cpu-throttling',
110-
'--disable-storage-reset',
111-
'--disable-network-throttling',
112-
'--chrome-flags=' + chromeFlags + '--user-agent=' + userAgent,
113-
`--config-path=${mixedContent}`
114-
]
115-
116-
const lighthousePath = require.resolve('lighthouse/lighthouse-cli/index.js')
117-
const lighthouse = ChildProcess.spawn(lighthousePath, args)
118-
119-
let output = ''
120-
lighthouse.stdout.on('data', (data) => {
121-
output += data
122-
})
123-
124-
stats.auditTimesByPageUrl[url] = {startTime: new Date()}
125-
lighthouse.once('close', () => {
126-
stats.auditTimesByPageUrl[url].endTime = new Date()
127-
let errorCount = 0
128-
129-
let report
130-
try {
131-
report = JSON.parse(output)
132-
} catch (parseError) {
133-
console.log();
134-
if(output != ''){
135-
console.error(`Parsing JSON report output failed for ${url}: ${output}`);
136-
console.log(parseError);
137-
} else{
138-
console.error(`Lighthouse report returned nothing for ${url}`);
139-
}
140-
141-
callback(1)
142-
return
143-
}
144-
145-
report.reportCategories.forEach((category) => {
146-
let displayedCategory = false
147-
category.audits.forEach((audit) => {
148-
if(audit.id != "is-on-https"){
149-
//mixed-content is buggy atm, will work on fixing.
150-
//is-on-https seems to surface everything well enough
151-
return;
152-
}
153-
154-
if (audit.score === 100) {
155-
stats.passedAuditsCount++
156-
} else {
157-
if (!displayedCategory) {
158-
console.log();
159-
console.log(category.name.bold.underline + ` current page count: ${stats.pageCount}`);
160-
displayedCategory = true
161-
}
162-
errorCount++
163-
console.log(url.replace(/\/$/, ''), '\u2717'.red, audit.id.bold, '-', audit.result.description.italic)
164-
165-
if (stats.violationCounts[category.name] === undefined) {
166-
stats.violationCounts[category.name] = 0
167-
}
168-
169-
if (audit.result.extendedInfo) {
170-
const {value} = audit.result.extendedInfo
171-
if (Array.isArray(value)) {
172-
stats.violationCounts[category.name] += value.length
173-
value.forEach((result) => {
174-
if (result.url) {
175-
console.log(` ${result.url}`)
176-
}
177-
})
178-
} else if (Array.isArray(value.nodes)) {
179-
stats.violationCounts[category.name] += value.nodes.length
180-
const messagesToNodes = {}
181-
value.nodes.forEach((result) => {
182-
let message = result.failureSummary
183-
message = message.replace(/^Fix any of the following:/g, '').trim()
184-
if (messagesToNodes[message]) {
185-
messagesToNodes[message].push(result.html)
186-
} else {
187-
messagesToNodes[message] = [result.html]
188-
}
189-
})
190-
Object.keys(messagesToNodes).forEach((message) => {
191-
console.log(` ${message}`)
192-
messagesToNodes[message].forEach(node => {
193-
console.log(` ${node}`.gray)
194-
})
195-
})
196-
} else {
197-
stats.violationCounts[category.name]++
198-
}
199-
}else if(audit.result.details && audit.result.details.items){
200-
audit.result.details.items.forEach((result) => {
201-
if (result[0].text) {
202-
console.log(` ${result[0].text}`)
203-
}
204-
})
205-
}
206-
}
207-
})
208-
})
209-
210-
callback(errorCount)
211-
})
212-
}
213-
214-
function printStats(config) {
215-
console.log();
216-
console.log();
217-
if(config.showHttpLinksAfter){
218-
for(var index in stats.foundHttpLinks) {
219-
console.log('Http link(s) on '.bold.underline + index.bold.underline);
220-
stats.foundHttpLinks[index].forEach(function(link) {
221-
console.log(' ' + link);
222-
});
223-
}
224-
}
225-
console.log();
226-
console.log();
227-
console.log('Lighthouse Summary'.bold.underline);
228-
console.log(` Total Pages Scanned: ${stats.pageCount}`);
229-
console.log(` Total Auditing Time: ${new Date() - stats.startTime} ms`);
230-
const totalTime = Object.keys(stats.auditTimesByPageUrl).reduce((sum, url) => {
231-
const {endTime, startTime} = stats.auditTimesByPageUrl[url]
232-
return (endTime - startTime) + sum
233-
}, 0)
234-
console.log(` Average Page Audit Time: ${Math.round(totalTime/stats.pageCount)} ms`);
235-
console.log(` Total Audits Passed: ${stats.passedAuditsCount}`, '\u2713'.green);
236-
if (Object.keys(stats.violationCounts).length === 0) {
237-
console.log(` Total Violations: None! \\o/ 🎉`);
238-
} else {
239-
console.log(` Total Violations:`);
240-
Object.keys(stats.violationCounts).forEach(category => {
241-
console.log(` ${category}: ${stats.violationCounts[category]}`, '\u2717'.red);
242-
})
243-
}
244-
}
53+
}

0 commit comments

Comments
 (0)