@@ -3,7 +3,9 @@
 import json
 import pathlib
 import random
+import re
 import socket
+from collections import defaultdict
 from urllib.parse import urlparse
 
 import requests
@@ -31,9 +33,24 @@
     "https://www.baeldung.com/",
 ]
 
-def show_files(filenames):
-    for filename in filenames:
-        print(filename)
+RULE_LANG_IN_PATH = re.compile(r'.*[\/\\](S\d{3,})[\/\\]([^\/]*)[\/\\]rule.html')
+
+def report_files(filenames):
+    lang_by_rule = defaultdict(list)
+    for file in filenames:
+        m = re.fullmatch(RULE_LANG_IN_PATH, file)
+        if m is not None:
+            lang_by_rule[m[1]].append(m[2])
+    res = ''
+    for k, v in lang_by_rule.items():
+        langs = ','.join(v)
+        res += f'| {k} ({langs})'
+    return res
+
+
+def error_message_for_domain(errors, urls):
+    return '|\n'.join(f'| {key} in:\n' + report_files(urls[key]) for key in errors)
+
 
 def load_url_probing_history():
     global link_probes_history
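A quick sketch of what the new helpers produce, assuming hypothetical rule-page paths shaped the way RULE_LANG_IN_PATH expects (.../<rule-key>/<language>/rule.html):

    # Hypothetical paths; the regex captures the rule key (S + 3 or more digits)
    # and the language directory that precedes rule.html.
    files = [
        'rules/S100/java/rule.html',
        'rules/S100/python/rule.html',
        'rules/S1234/cfamily/rule.html',
    ]
    print(report_files(files))
    # -> | S100 (java,python)| S1234 (cfamily)

Each rule key appears once, with every language whose rule.html contained the failing link joined by commas.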
@@ -222,9 +239,7 @@ def report_errors(errors, urls):
     by_domain = dict((k, list(g)) for k, g in itertools.groupby(errors, lambda url: urlparse(url).netloc))
     for k, v in by_domain.items():
         print(f"For domain = {k}")
-        for key in v:
-            print(f"{key} in:")
-            show_files(urls[key])
+        print(error_message_for_domain(v, urls))
         print("")
 
 def check_html_links(dir):
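One caveat worth illustrating in report_errors: itertools.groupby only merges consecutive items, so by_domain is only fully grouped when errors is already sorted by domain. A minimal sketch of the pattern, with placeholder URLs rather than real crawl results:

    import itertools
    from urllib.parse import urlparse

    errors = sorted(['https://a.example/x', 'https://b.example/y', 'https://a.example/z'],
                    key=lambda url: urlparse(url).netloc)
    by_domain = dict((k, list(g)) for k, g in itertools.groupby(errors, lambda url: urlparse(url).netloc))
    # {'a.example': ['https://a.example/x', 'https://a.example/z'],
    #  'b.example': ['https://b.example/y']}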