Commit ee9c9ea
Author: Drew McMillan
Commit message: more
1 parent 35a66fe

File tree: 11 files changed (+2479 additions, -237 deletions)


example.json

Lines changed: 2 additions & 2 deletions
```diff
@@ -4,8 +4,8 @@
   "maxChromeInstances": 10,
   "limit": "/news/",
   "httpsOnly": true,
-  "showHttpLinksDuring": true,
+  "showHttpLinksDuring": false,
   "showHttpLinksAfter": true,
-  "chromeFlags": "--no-sandbox --user-data-dir --headless --disable-setuid-sandbox --disable-gpu",
+  "chromeFlags": ["--show-paint-rects", "--no-sandbox", "--user-data-dir", "--headless", "--disable-setuid-sandbox", "--disable-gpu", "--disable-dev-shm-usage"],
   "userAgent": "light-mc-crawler - News Discovery - Mixed Content Crawler"
 }
```
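
The substantive change here is `chromeFlags` moving from one space-separated string to an array of individual flags. The inline `runLighthouse` removed in the index.js diff below concatenated the string form straight into Lighthouse's `--chrome-flags` argument, so whichever new module consumes this config presumably joins the array back together. A minimal sketch of that normalization, where `normalizeChromeFlags` is a hypothetical name and not code from this commit:

```js
// Hypothetical helper, not part of this commit's visible diff: accepts the new
// array form of config.chromeFlags as well as the old string form, falling
// back to the default that the removed runLighthouse used.
function normalizeChromeFlags (chromeFlags) {
  if (Array.isArray(chromeFlags)) {
    return chromeFlags.join(' ') // new array form -> one space-separated string
  }
  return chromeFlags || '--headless --disable-gpu' // old string form, or default
}
```

Of the two added flags, `--show-paint-rects` outlines regions as Chrome repaints them (a debugging aid), and `--disable-dev-shm-usage` keeps Chrome from writing to /dev/shm, which avoids crashes in containers where that partition is small.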

index.js

Lines changed: 33 additions & 235 deletions
```diff
@@ -1,255 +1,53 @@
-const cheerio = require('cheerio')
-const ChildProcess = require('child_process')
-const Crawler = require('simplecrawler')
-const path = require('path')
-const queue = require('async/queue')
-const fs = require('fs')
-const colors = require('colors')
-const util = require('util')
-
-const stats = {
+const cheerio = require('cheerio'),
+      queue = require('./src/queue'),
+      path = require('path'),
+      fs = require('fs'),
+      Crawler = require('./src/crawler'),
+      printStats = require('./src/log/final');
+
+let config, stats = {
   pageCount: 0,
-  violationCounts: {},
-  foundHttpLinks: {},
-  passedAuditsCount: 0,
-  startTime: null,
-  auditTimesByPageUrl: {}
+  totalErrorCount: 0,
+  startTime: new Date()
+};
+
+function discoverResources(buffer, item) {
+  const page = cheerio.load(buffer.toString('utf8'))
+  var links = page('a[href]').map(function () {
+    return page(this).attr('href')
+  }).get()
+
+  if(config.limit){
+    links = links.filter(function(s){
+      return ~s.indexOf(config.limit);
+    });
+  }
+
+  return links
 }
 
 module.exports = (options) => {
   console.log("ô¿ô light-mc-crawler has started crawling. If it looks like nothing is happening, wait, it is :)");
 
-  stats.startTime = new Date()
-
-  const configPath = path.resolve(options.config)
-  const config = JSON.parse(fs.readFileSync(configPath))
-
-  const crawler = new Crawler(options.url || config.url)
-  crawler.respectRobotsTxt = false
-  crawler.parseHTMLComments = false
-  crawler.parseScriptTags = false
-  crawler.userAgent = options.userAgent || "light-mc-crawler Mixed Content Crawler"
-  crawler.maxDepth = config.maxDepth || 1
-
-
-  crawler.discoverResources = (buffer, item) => {
-    const page = cheerio.load(buffer.toString('utf8'))
-    var links = page('a[href]').map(function () {
-      return page(this).attr('href')
-    }).get()
-
-    if(config.limit){
-      links = links.filter(function(s){
-        return ~s.indexOf(config.limit);
-      });
-    }
-
-    if(config.showHttpLinksDuring || config.showHttpLinksAfter){
-      links.forEach(function(link) {
-        if(link.indexOf('http://') !== -1){
-          if(!stats.foundHttpLinks[item.url]){
-            stats.foundHttpLinks[item.url] = [];
-          }
-
-          stats.foundHttpLinks[item.url].push(link)
-        }
-      });
+  config = JSON.parse(fs.readFileSync(path.resolve(options.config)))
 
-      if(config.showHttpLinksDuring && stats.foundHttpLinks[item.url]){
-        console.log();
-        console.log('Http link(s) on '.bold.underline + item.url.bold.underline);
-        stats.foundHttpLinks[item.url].forEach(function(link) {
-          console.log('  ' + link);
-        });
-      }
-    }
-
-    return links
-  }
-
-  let totalErrorCount = 0
+  const lighthouseQueue = queue(config, stats);
+  const crawler = Crawler(config);
 
-  const lighthouseQueue = queue((url, callback) => {
-    runLighthouse(url, config, (errorCount) => {
-      totalErrorCount += errorCount
-      callback()
-    })
-  }, config.maxChromeInstances || 5)
+  crawler.discoverResources = discoverResources;
 
   crawler.on('fetchcomplete', (queueItem, responseBuffer, response) => {
     lighthouseQueue.push(queueItem.url)
-  })
+  });
 
   crawler.once('complete', () => {
     lighthouseQueue.drain = () => {
-      printStats(config)
-      if (totalErrorCount > 0) {
+      printStats(stats)
+      if (stats.totalErrorCount > 0) {
         process.exit(1)
       }
     }
   })
 
   crawler.start()
-}
-
-function runLighthouse (url, config, callback) {
-  console.log('RUN LIGHTHOUSE');
-  if(config.httpsOnly){
-    url = url.replace("http://", "https://");
-  }
-
-  stats.pageCount++
-  var mixedContent = require.resolve('lighthouse/lighthouse-core/config/mixed-content.js')
-  var chromeFlags = config.chromeFlags || '--headless --disable-gpu';
-  var userAgent = config.userAgent || 'light-mc-crawler Mixed Content Crawler'
-  const args = [
-    url,
-    '--output=json',
-    '--output-path=stdout',
-    '--disable-device-emulation',
-    '--disable-cpu-throttling',
-    '--enable-error-reporting',
-    '--disable-storage-reset',
-    '--disable-network-throttling',
-    '--chrome-flags=' + chromeFlags + '--user-agent=' + userAgent,
-    `--config-path=${mixedContent}`
-  ]
-
-  const lighthousePath = require.resolve('lighthouse/lighthouse-cli/index.js')
-  const lighthouse = ChildProcess.spawn(lighthousePath, args)
-
-  let output = ''
-  lighthouse.stdout.on('data', (data) => {
-    console.log('DATATAT');
-    output += data
-  })
-
-  stats.auditTimesByPageUrl[url] = {startTime: new Date()}
-  lighthouse.once('close', () => {
-    console.log('CLOSE LIGHTHOUSE');
-    stats.auditTimesByPageUrl[url].endTime = new Date()
-    let errorCount = 0
-
-    let report
-    try {
-      report = JSON.parse(output)
-    } catch (parseError) {
-      console.log();
-      if(output != ''){
-        console.error(`Parsing JSON report output failed for ${url}: ${output}`);
-        console.log(parseError);
-      } else if (output.includes('Something went wrong')) {
-
-        console.log('SOMETHING WENT WRONG');
-        console.log(output);
-
-
-
-      }else{
-        console.error(`Lighthouse report returned nothing for ${url}`);
-      }
-
-      callback(1)
-      return
-    }
-
-    report.reportCategories.forEach((category) => {
-      let displayedCategory = false
-      category.audits.forEach((audit) => {
-        if(audit.id != "is-on-https"){
-          //mixed-content is buggy atm, will work on fixing.
-          //is-on-https seems to surface everything well enough
-          return;
-        }
-
-        if (audit.score === 100) {
-          stats.passedAuditsCount++
-        } else {
-          if (!displayedCategory) {
-            console.log();
-            console.log(category.name.bold.underline + ` current page count: ${stats.pageCount}`);
-            displayedCategory = true
-          }
-          errorCount++
-          console.log(url.replace(/\/$/, ''), '\u2717'.red, audit.id.bold, '-', audit.result.description.italic)
-
-          if (stats.violationCounts[category.name] === undefined) {
-            stats.violationCounts[category.name] = 0
-          }
-
-          if (audit.result.extendedInfo) {
-            const {value} = audit.result.extendedInfo
-            if (Array.isArray(value)) {
-              stats.violationCounts[category.name] += value.length
-              value.forEach((result) => {
-                if (result.url) {
-                  console.log(`  ${result.url}`)
-                }
-              })
-            } else if (Array.isArray(value.nodes)) {
-              stats.violationCounts[category.name] += value.nodes.length
-              const messagesToNodes = {}
-              value.nodes.forEach((result) => {
-                let message = result.failureSummary
-                message = message.replace(/^Fix any of the following:/g, '').trim()
-                if (messagesToNodes[message]) {
-                  messagesToNodes[message].push(result.html)
-                } else {
-                  messagesToNodes[message] = [result.html]
-                }
-              })
-              Object.keys(messagesToNodes).forEach((message) => {
-                console.log(`  ${message}`)
-                messagesToNodes[message].forEach(node => {
-                  console.log(`    ${node}`.gray)
-                })
-              })
-            } else {
-              stats.violationCounts[category.name]++
-            }
-          }else if(audit.result.details && audit.result.details.items){
-            audit.result.details.items.forEach((result) => {
-              if (result[0].text) {
-                console.log(`  ${result[0].text}`)
-              }
-            })
-          }
-        }
-      })
-    })
-
-    callback(errorCount)
-  })
-}
-
-function printStats(config) {
-  console.log();
-  console.log();
-  if(config.showHttpLinksAfter){
-    for(var index in stats.foundHttpLinks) {
-      console.log('Http link(s) on '.bold.underline + index.bold.underline);
-      stats.foundHttpLinks[index].forEach(function(link) {
-        console.log('  ' + link);
-      });
-    }
-  }
-  console.log();
-  console.log();
-  console.log('Lighthouse Summary'.bold.underline);
-  console.log(`  Total Pages Scanned: ${stats.pageCount}`);
-  console.log(`  Total Auditing Time: ${new Date() - stats.startTime} ms`);
-  const totalTime = Object.keys(stats.auditTimesByPageUrl).reduce((sum, url) => {
-    const {endTime, startTime} = stats.auditTimesByPageUrl[url]
-    return (endTime - startTime) + sum
-  }, 0)
-  console.log(`  Average Page Audit Time: ${Math.round(totalTime/stats.pageCount)} ms`);
-  console.log(`  Total Audits Passed: ${stats.passedAuditsCount}`, '\u2713'.green);
-  if (Object.keys(stats.violationCounts).length === 0) {
-    console.log(`  Total Violations: None! \\o/ 🎉`);
-  } else {
-    console.log(`  Total Violations:`);
-    Object.keys(stats.violationCounts).forEach(category => {
-      console.log(`    ${category}: ${stats.violationCounts[category]}`, '\u2717'.red);
-    })
-  }
-}
+}
```
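
The new `./src/queue`, `./src/crawler`, and `./src/log/final` modules are among the nine changed files not shown on this page, so their contents are unknown here. Reconstructed from the inline code this commit deletes and from the new call sites (`queue(config, stats)`, `Crawler(config)` without `new`, `printStats(stats)`), they plausibly look like the sketches below; every path and signature beyond those call sites is an assumption.

```js
// src/queue.js: hypothetical reconstruction from the removed inline code.
// A worker pool that runs Lighthouse against one URL per worker, capped at
// config.maxChromeInstances concurrent Chrome instances (default 5, as before).
const queue = require('async/queue')
const runLighthouse = require('./lighthouse') // assumed new home of runLighthouse

module.exports = (config, stats) => {
  return queue((url, callback) => {
    // Passing stats in is an assumption; the old runLighthouse reached it via closure.
    runLighthouse(url, config, stats, (errorCount) => {
      stats.totalErrorCount += errorCount // replaces the old local totalErrorCount
      callback()
    })
  }, config.maxChromeInstances || 5)
}
```

```js
// src/crawler.js: hypothetical reconstruction of the removed simplecrawler setup.
// Exported as a factory because the new index.js calls Crawler(config) without `new`.
const Crawler = require('simplecrawler')

module.exports = (config) => {
  const crawler = new Crawler(config.url)
  crawler.respectRobotsTxt = false
  crawler.parseHTMLComments = false
  crawler.parseScriptTags = false
  crawler.userAgent = config.userAgent || 'light-mc-crawler Mixed Content Crawler'
  crawler.maxDepth = config.maxDepth || 1
  return crawler
}
```

```js
// src/log/final.js: hypothetical. The slimmed-down stats object now carries only
// pageCount, totalErrorCount, and startTime, so the old per-category violation
// summary can no longer be derived from it.
require('colors') // patches String.prototype with .bold, .underline, etc.

module.exports = (stats) => {
  console.log()
  console.log('Lighthouse Summary'.bold.underline)
  console.log(`  Total Pages Scanned: ${stats.pageCount}`)
  console.log(`  Total Auditing Time: ${new Date() - stats.startTime} ms`)
  console.log(`  Total Errors: ${stats.totalErrorCount}`)
}
```

Two behavioral notes that follow from the visible diff alone: the CLI overrides `options.url` and `options.userAgent` no longer reach the crawler unless they are merged into `config` elsewhere, and the extracted `discoverResources` drops the http:// link tracking, leaving `showHttpLinksDuring`/`showHttpLinksAfter` with nothing to report, which is consistent with example.json flipping `showHttpLinksDuring` to false.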
