forked from lewisdonovan/google-news-scraper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
getPrettyUrl.js
61 lines (53 loc) · 1.71 KB
/
getPrettyUrl.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
/**
 * Extracts the target URL that is Base64-encoded inside an `articles/<payload>`
 * link (presumably a Google News article link — confirm against callers).
 *
 * The payload is decoded, split on every run of non-printable bytes, and the
 * first `http(s)://…` substring found in the printable pieces is returned.
 * The function is best-effort: on any failure it logs to `console.error` and
 * returns the input unchanged.
 *
 * @param {string} url - Link that may contain an `articles/<base64>` segment.
 * @returns {string} The embedded URL if one was found, otherwise `url` as given.
 */
const getPrettyUrl = url => {
  // A non-string cannot contain a payload; hand it back instead of throwing.
  if (typeof url !== "string") {
    console.error("No Base64 segment found in the URL. Original URL:", url);
    return url;
  }

  const base64Pattern = /articles\/([A-Za-z0-9+_\-\/=]+)/;
  const match = url.match(base64Pattern);
  if (!match) {
    console.error("No Base64 segment found in the URL. Original URL:", url);
    return url;
  }

  // Convert the URL-safe Base64 alphabet back to the standard one.
  const base64EncodedUrl = match[1].replace(/-/g, "+").replace(/_/g, "/");

  try {
    // Decode as latin1 rather than ascii: Node's "ascii" decoding clears the
    // high bit of every byte, so a binary framing byte such as 0xD2 would turn
    // into a literal "R" glued to the URL. With latin1 such bytes stay outside
    // the printable range and are treated as delimiters by the split below,
    // so no character of the real URL ever has to be stripped afterwards.
    const decodedUrl = Buffer.from(base64EncodedUrl, "base64").toString("latin1");

    // Keep only the runs of printable ASCII characters.
    const parts = decodedUrl.split(/[^\x20-\x7E]+/).filter(Boolean);

    // A printable run may carry leading framing characters (e.g. a quote),
    // so search inside each run instead of requiring it to start with http.
    const urlPattern = /https?:\/\/[^\s]+/;
    for (const part of parts) {
      const urlMatch = part.match(urlPattern);
      if (urlMatch) {
        return urlMatch[0]; // First URL wins.
      }
    }

    console.error("No valid URL found in the decoded string:", decodedUrl);
    return url;
  } catch (error) {
    console.error(
      "Error decoding Base64 string:",
      base64EncodedUrl,
      "Original URL:",
      url,
      "Error:",
      error.message,
    );
    return url;
  }
};
// CommonJS export: the function is published under the `default` key,
// so consumers reach it through the `default` property of the required module.
module.exports = { default: getPrettyUrl };