// Dependencies: HTML parsing plus project-local crawl, queue, and reporting helpers.
// One declaration per line (no comma-chained const/let), grouped stdlib-last-after-third-party.
const cheerio = require('cheerio');
const path = require('path');
const fs = require('fs');
const queue = require('./src/queue');
const Crawler = require('./src/crawler');
const printStats = require('./src/log/final');

// Crawl configuration; loaded from the JSON file named by options.config when
// the exported entry point runs, and read by the discoverResources hook.
let config;

// Mutable crawl-wide counters, shared with the lighthouse queue and the final
// summary printer. Never reassigned (only its properties change), so const.
const stats = {
  pageCount: 0,         // pages audited so far
  totalErrorCount: 0,   // accumulated audit violations across all pages
  startTime: new Date() // wall-clock start of the run, for total-time reporting
};
14
/**
 * simplecrawler resource-discovery hook: extract candidate link URLs from a
 * fetched page so the crawler can enqueue them.
 *
 * @param {Buffer} buffer - raw response body of the fetched page
 * @param {Object} item - crawler queue item for the page (unused here; kept to
 *   match the discoverResources hook signature)
 * @returns {string[]} href values of every anchor on the page, optionally
 *   filtered down to links containing the configured limit substring
 */
function discoverResources (buffer, item) {
  const page = cheerio.load(buffer.toString('utf8'));

  // `let` (not `var`): the binding is reassigned when filtering below.
  let links = page('a[href]').map(function () {
    return page(this).attr('href');
  }).get();

  // When config.limit is set, keep only links containing that substring.
  // `.includes` replaces the opaque `~indexOf` bitwise-truthiness trick.
  if (config.limit) {
    links = links.filter((href) => href.includes(config.limit));
  }

  return links;
}
18
28
19
29
module . exports = ( options ) => {
20
30
console . log ( "ô¿ô light-mc-crawler has started crawling. If it looks like nothing is happening, wait, it is :)" ) ;
21
31
22
- stats . startTime = new Date ( )
23
-
24
- const configPath = path . resolve ( options . config )
25
- const config = JSON . parse ( fs . readFileSync ( configPath ) )
26
-
27
- const crawler = new Crawler ( options . url || config . url )
28
- crawler . respectRobotsTxt = false
29
- crawler . parseHTMLComments = false
30
- crawler . parseScriptTags = false
31
- crawler . userAgent = options . userAgent || "light-mc-crawler Mixed Content Crawler"
32
- crawler . maxDepth = config . maxDepth || 1
33
-
34
-
35
- crawler . discoverResources = ( buffer , item ) => {
36
- const page = cheerio . load ( buffer . toString ( 'utf8' ) )
37
- var links = page ( 'a[href]' ) . map ( function ( ) {
38
- return page ( this ) . attr ( 'href' )
39
- } ) . get ( )
40
-
41
- if ( config . limit ) {
42
- links = links . filter ( function ( s ) {
43
- return ~ s . indexOf ( config . limit ) ;
44
- } ) ;
45
- }
46
-
47
- if ( config . showHttpLinksDuring || config . showHttpLinksAfter ) {
48
- links . forEach ( function ( link ) {
49
- if ( link . indexOf ( 'http://' ) !== - 1 ) {
50
- if ( ! stats . foundHttpLinks [ item . url ] ) {
51
- stats . foundHttpLinks [ item . url ] = [ ] ;
52
- }
53
-
54
- stats . foundHttpLinks [ item . url ] . push ( link )
55
- }
56
- } ) ;
32
+ config = JSON . parse ( fs . readFileSync ( path . resolve ( options . config ) ) )
57
33
58
- if ( config . showHttpLinksDuring && stats . foundHttpLinks [ item . url ] ) {
59
- console . log ( ) ;
60
- console . log ( 'Http link(s) on ' . bold . underline + item . url . bold . underline ) ;
61
- stats . foundHttpLinks [ item . url ] . forEach ( function ( link ) {
62
- console . log ( ' ' + link ) ;
63
- } ) ;
64
- }
65
- }
66
-
67
- return links
68
- }
69
-
70
- let totalErrorCount = 0
34
+ const lighthouseQueue = queue ( config , stats ) ;
35
+ const crawler = Crawler ( config ) ;
71
36
72
- const lighthouseQueue = queue ( ( url , callback ) => {
73
- runLighthouse ( url , config , ( errorCount ) => {
74
- totalErrorCount += errorCount
75
- callback ( )
76
- } )
77
- } , config . maxChromeInstances || 5 )
37
+ crawler . discoverResources = discoverResources ;
78
38
79
39
crawler . on ( 'fetchcomplete' , ( queueItem , responseBuffer , response ) => {
80
40
lighthouseQueue . push ( queueItem . url )
81
- } )
41
+ } ) ;
82
42
83
43
crawler . once ( 'complete' , ( ) => {
84
44
lighthouseQueue . drain = ( ) => {
85
- printStats ( config )
86
- if ( totalErrorCount > 0 ) {
45
+ printStats ( stats )
46
+ if ( stats . totalErrorCount > 0 ) {
87
47
process . exit ( 1 )
88
48
}
89
49
}
90
50
} )
91
51
92
52
crawler . start ( )
93
- }
94
-
95
- function runLighthouse ( url , config , callback ) {
96
- console . log ( 'RUN LIGHTHOUSE' ) ;
97
- if ( config . httpsOnly ) {
98
- url = url . replace ( "http://" , "https://" ) ;
99
- }
100
-
101
- stats . pageCount ++
102
- var mixedContent = require . resolve ( 'lighthouse/lighthouse-core/config/mixed-content.js' )
103
- var chromeFlags = config . chromeFlags || '--headless --disable-gpu' ;
104
- var userAgent = config . userAgent || 'light-mc-crawler Mixed Content Crawler'
105
- const args = [
106
- url ,
107
- '--output=json' ,
108
- '--output-path=stdout' ,
109
- '--disable-device-emulation' ,
110
- '--disable-cpu-throttling' ,
111
- '--enable-error-reporting' ,
112
- '--disable-storage-reset' ,
113
- '--disable-network-throttling' ,
114
- '--chrome-flags=' + chromeFlags + '--user-agent=' + userAgent ,
115
- `--config-path=${ mixedContent } `
116
- ]
117
-
118
- const lighthousePath = require . resolve ( 'lighthouse/lighthouse-cli/index.js' )
119
- const lighthouse = ChildProcess . spawn ( lighthousePath , args )
120
-
121
- let output = ''
122
- lighthouse . stdout . on ( 'data' , ( data ) => {
123
- console . log ( 'DATATAT' ) ;
124
- output += data
125
- } )
126
-
127
- stats . auditTimesByPageUrl [ url ] = { startTime : new Date ( ) }
128
- lighthouse . once ( 'close' , ( ) => {
129
- console . log ( 'CLOSE LIGHTHOUSE' ) ;
130
- stats . auditTimesByPageUrl [ url ] . endTime = new Date ( )
131
- let errorCount = 0
132
-
133
- let report
134
- try {
135
- report = JSON . parse ( output )
136
- } catch ( parseError ) {
137
- console . log ( ) ;
138
- if ( output != '' ) {
139
- console . error ( `Parsing JSON report output failed for ${ url } : ${ output } ` ) ;
140
- console . log ( parseError ) ;
141
- } else if ( output . includes ( 'Something went wrong' ) ) {
142
-
143
- console . log ( 'SOMETHING WENT WRONG' ) ;
144
- console . log ( output ) ;
145
-
146
-
147
-
148
- } else {
149
- console . error ( `Lighthouse report returned nothing for ${ url } ` ) ;
150
- }
151
-
152
- callback ( 1 )
153
- return
154
- }
155
-
156
- report . reportCategories . forEach ( ( category ) => {
157
- let displayedCategory = false
158
- category . audits . forEach ( ( audit ) => {
159
- if ( audit . id != "is-on-https" ) {
160
- //mixed-content is buggy atm, will work on fixing.
161
- //is-on-https seems to surface everything well enough
162
- return ;
163
- }
164
-
165
- if ( audit . score === 100 ) {
166
- stats . passedAuditsCount ++
167
- } else {
168
- if ( ! displayedCategory ) {
169
- console . log ( ) ;
170
- console . log ( category . name . bold . underline + ` current page count: ${ stats . pageCount } ` ) ;
171
- displayedCategory = true
172
- }
173
- errorCount ++
174
- console . log ( url . replace ( / \/ $ / , '' ) , '\u2717' . red , audit . id . bold , '-' , audit . result . description . italic )
175
-
176
- if ( stats . violationCounts [ category . name ] === undefined ) {
177
- stats . violationCounts [ category . name ] = 0
178
- }
179
-
180
- if ( audit . result . extendedInfo ) {
181
- const { value} = audit . result . extendedInfo
182
- if ( Array . isArray ( value ) ) {
183
- stats . violationCounts [ category . name ] += value . length
184
- value . forEach ( ( result ) => {
185
- if ( result . url ) {
186
- console . log ( ` ${ result . url } ` )
187
- }
188
- } )
189
- } else if ( Array . isArray ( value . nodes ) ) {
190
- stats . violationCounts [ category . name ] += value . nodes . length
191
- const messagesToNodes = { }
192
- value . nodes . forEach ( ( result ) => {
193
- let message = result . failureSummary
194
- message = message . replace ( / ^ F i x a n y o f t h e f o l l o w i n g : / g, '' ) . trim ( )
195
- if ( messagesToNodes [ message ] ) {
196
- messagesToNodes [ message ] . push ( result . html )
197
- } else {
198
- messagesToNodes [ message ] = [ result . html ]
199
- }
200
- } )
201
- Object . keys ( messagesToNodes ) . forEach ( ( message ) => {
202
- console . log ( ` ${ message } ` )
203
- messagesToNodes [ message ] . forEach ( node => {
204
- console . log ( ` ${ node } ` . gray )
205
- } )
206
- } )
207
- } else {
208
- stats . violationCounts [ category . name ] ++
209
- }
210
- } else if ( audit . result . details && audit . result . details . items ) {
211
- audit . result . details . items . forEach ( ( result ) => {
212
- if ( result [ 0 ] . text ) {
213
- console . log ( ` ${ result [ 0 ] . text } ` )
214
- }
215
- } )
216
- }
217
- }
218
- } )
219
- } )
220
-
221
- callback ( errorCount )
222
- } )
223
- }
224
-
225
- function printStats ( config ) {
226
- console . log ( ) ;
227
- console . log ( ) ;
228
- if ( config . showHttpLinksAfter ) {
229
- for ( var index in stats . foundHttpLinks ) {
230
- console . log ( 'Http link(s) on ' . bold . underline + index . bold . underline ) ;
231
- stats . foundHttpLinks [ index ] . forEach ( function ( link ) {
232
- console . log ( ' ' + link ) ;
233
- } ) ;
234
- }
235
- }
236
- console . log ( ) ;
237
- console . log ( ) ;
238
- console . log ( 'Lighthouse Summary' . bold . underline ) ;
239
- console . log ( ` Total Pages Scanned: ${ stats . pageCount } ` ) ;
240
- console . log ( ` Total Auditing Time: ${ new Date ( ) - stats . startTime } ms` ) ;
241
- const totalTime = Object . keys ( stats . auditTimesByPageUrl ) . reduce ( ( sum , url ) => {
242
- const { endTime, startTime} = stats . auditTimesByPageUrl [ url ]
243
- return ( endTime - startTime ) + sum
244
- } , 0 )
245
- console . log ( ` Average Page Audit Time: ${ Math . round ( totalTime / stats . pageCount ) } ms` ) ;
246
- console . log ( ` Total Audits Passed: ${ stats . passedAuditsCount } ` , '\u2713' . green ) ;
247
- if ( Object . keys ( stats . violationCounts ) . length === 0 ) {
248
- console . log ( ` Total Violations: None! \\o/ 🎉` ) ;
249
- } else {
250
- console . log ( ` Total Violations:` ) ;
251
- Object . keys ( stats . violationCounts ) . forEach ( category => {
252
- console . log ( ` ${ category } : ${ stats . violationCounts [ category ] } ` , '\u2717' . red ) ;
253
- } )
254
- }
255
- }
53
+ }
0 commit comments