Skip to content

Commit 558efc0

Browse files
committed
added support for bloomberg via search API, added autofocus to article page, added additional article text sanitization, reformatted mapping file
1 parent 4d7ca39 commit 558efc0

File tree

9 files changed

+108
-55
lines changed

9 files changed

+108
-55
lines changed

.env.example

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,3 @@
1-
PORT=3030
1+
PORT=3030
2+
GOOGLE_SEARCH_ID=xxxxxxxxxx
3+
GOOGLE_SEARCH_KEY=yyyyyyyyyy

README.md

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,17 +13,27 @@ URL: https://reader.dangerous.dev
1313
* LA Times (latimes.com)
1414
* The Athletic (theathletic.com)
1515
* Business Insider (businessinsider.com)
16+
* Bloomberg (bloomberg.com)
1617
* Vogue (vogue.com)
1718

18-
### Unsupported Sites
19-
* Wall Street Journal (wsj.com)
19+
### Prerequisites
20+
21+
1. Register a custom Google search engine by going to https://developers.google.com/custom-search/v1/introduction and pressing "Get A Key"
22+
2. As per their site:
23+
24+
>Once it is created, you can find the engine's ID in the Setup > Basics > Search Engine ID section of the Control Panel
25+
26+
This is where you configure which sites your search engine will search. For the purposes of this app in its current state, add only bloomberg.com.
27+
28+
Take note of your API key and Search Engine ID.
2029

2130
### Installation
2231

2332
```bash
2433
git clone https://github.com/joshterrill/paywall-reader
2534
cd paywall-reader/
2635
cp .env.example .env
36+
# replace GOOGLE_SEARCH_KEY and GOOGLE_SEARCH_ID with the API key and Search Engine ID from the prerequisites section
2737
npm i
2838
npm start
2939
```
@@ -37,4 +47,4 @@ Pull requests would gladly be accepted for adding support for more sites (as lon
3747
### Todo
3848
* Fix relative and absolute links in embedded HTML so that they point to the domain the article originally came from
3949
* Add DOM sanitization for incoming HTML
40-
* Add checks for source and URL fields on requests to ensure the URL matches the URL in the `news-source-map.json` file
50+
* Add checks for source and URL fields on requests to ensure the URL matches the URL in the `news-source-map.json` file

index.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,8 @@ app.get('/read', async (req, res) => {
3434
if (!source || !url) {
3535
throw new Error('Source or URL not provided');
3636
}
37-
const direct = newsSourceMapping[source].direct;
38-
const { articleText, articleHeadline } = await parse.getContent(source, url, direct);
37+
const sourceMapping = newsSourceMapping[source];
38+
const { articleText, articleHeadline } = await parse.getContent(source, url, sourceMapping.method);
3939
res.render('read', {source, sourceText: newsSourceMapping[source].name, articleText, articleHeadline});
4040
} catch (error) {
4141
console.log(error);

news-source-map.json

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
{
2-
"nyt": {"name": "New York Times", "url": "nytimes.com"},
3-
"nytcooking": {"name": "New York Times Cooking", "url": "cooking.nytimes.com"},
4-
"newyorker": {"name": "The New Yorker", "url": "newyorker.com"},
5-
"economist": {"name": "The Economist", "url": "economist.com"},
6-
"washingtonpost": {"name": "Washington Post", "url": "washingtonpost.com"},
7-
"latimes": {"name": "LA Times", "url": "latimes.com", "direct": true},
8-
"theathletic": {"name": "The Athletic", "url": "theathletic.com", "direct": true},
9-
"businessinsider": {"name": "Business Insider", "url": "businessinsider.com"},
10-
"vogue": {"name": "Vogue", "url": "vogue.com"}
2+
"nytimes.com": {"name": "New York Times", "method": "ARCHIVE"},
3+
"cooking.nytimes.com": {"name": "New York Times Cooking", "method": "ARCHIVE"},
4+
"newyorker.com": {"name": "The New Yorker", "method": "ARCHIVE"},
5+
"economist.com": {"name": "The Economist", "method": "ARCHIVE"},
6+
"washingtonpost.com": {"name": "Washington Post", "method": "ARCHIVE"},
7+
"latimes.com": {"name": "LA Times", "method": "DIRECT"},
8+
"theathletic.com": {"name": "The Athletic", "method": "DIRECT"},
9+
"businessinsider.com": {"name": "Business Insider", "method": "ARCHIVE"},
10+
"bloomberg.com": {"name": "Bloomberg", "method": "GOOGLE"},
11+
"vogue.com": {"name": "Vogue", "method": "ARCHIVE"}
1112
}

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"name": "paywall-reader",
33
"description": "A web app that lets you read articles on popular news sites that get hidden behind paywalls.",
4-
"version": "1.0.0",
4+
"version": "1.1.2",
55
"main": "index.js",
66
"scripts": {
77
"start": "node index.js",

parser.js

Lines changed: 54 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ function sanitizeUrl(url) {
66
return url.split('?')[0];
77
}
88

9-
async function checkUrl(url) {
9+
async function checkUrlArchive(url) {
1010
url = sanitizeUrl(url);
1111
const data = await fetch(`https://archive.org/wayback/available?url=${url}`);
1212
if (!data) {
@@ -19,6 +19,19 @@ async function checkUrl(url) {
1919
return json.archived_snapshots.closest.url;
2020
}
2121

22+
async function checkUrlGoogle(url, site) {
23+
url = sanitizeUrl(url);
24+
const searchTerm = url.split('/')[url.split('/').length - 1];
25+
const res = await fetch(`https://content-customsearch.googleapis.com/customsearch/v1?cx=${process.env.GOOGLE_SEARCH_ID}&key=${process.env.GOOGLE_SEARCH_KEY}&q=${searchTerm}`)
26+
const json = await res.json();
27+
if (!json?.items?.length) {
28+
throw new Error('Unable to get result from search engine');
29+
}
30+
const { cacheId } = json.items[0];
31+
const webCacheUrl = `http://webcache.googleusercontent.com/search?q=cache:${cacheId}:${site}`;
32+
return webCacheUrl;
33+
}
34+
2235
async function nyt(url) {
2336
const rawHtml = await fetch(url);
2437
const html = await rawHtml.text();
@@ -41,10 +54,10 @@ async function newyorker(url) {
4154
const rawHtml = await fetch(url);
4255
const html = await rawHtml.text();
4356
const $ = cheerio.load(html);
44-
const scriptTag = $('script[type="application/ld+json"]').text().split(',\'keywords\':');
45-
const badJsonFixer = JSON.parse(`${scriptTag[0]}}`); // wtf
46-
const articleText = marked.parse(badJsonFixer.articleBody);
47-
const articleHeadline = badJsonFixer.headline;
57+
const scriptTag = $('script[type="application/ld+json"]').first().text().split(',\'keywords\':');
58+
const json = JSON.parse(scriptTag[0]);;
59+
const articleText = marked.parse(json.articleBody);
60+
const articleHeadline = json.headline;
4861
return { articleText, articleHeadline };
4962
}
5063

@@ -118,11 +131,23 @@ async function businessInsider(url) {
118131
const rootUrl = Object.keys(imageJson)[0];
119132
image.parent().html(`<img src="${decodeURIComponent(rootUrl)}" />`);
120133
});
134+
$('.inline-newsletter-signup').remove();
121135
const articleHtml = $('.content-lock-content').html();
122136
const articleText = articleHtml;
123137
return { articleText, articleHeadline };
124138
}
125139

140+
async function bloomberg(url) {
141+
const rawHtml = await fetch(url);
142+
const html = await rawHtml.text();
143+
const $ = cheerio.load(html);
144+
const scriptTag = $('script[data-component-props="ArticleBody"]').text();
145+
const json = JSON.parse(scriptTag);
146+
const articleText = json.story.body.replace(/60x-1/g, '1200x-1'); // replace low res images with higher res
147+
const articleHeadline = json.story.seoHeadline;
148+
return { articleText, articleHeadline };
149+
}
150+
126151
async function vogue(url) {
127152
const rawHtml = await fetch(url);
128153
const html = await rawHtml.text();
@@ -138,59 +163,70 @@ async function vogue(url) {
138163
return { articleText, articleHeadline };
139164
}
140165

141-
async function getContent(source, url, direct) {
166+
async function getContent(source, url, method) {
167+
console.log(source, url, method);
142168
let articleText = null;
143169
let articleHeadline = null;
144-
if (!direct) {
145-
url = await checkUrl(url);
170+
if (method === 'ARCHIVE') {
171+
url = await checkUrlArchive(url);
172+
} else if (method === 'GOOGLE') {
173+
url = await checkUrlGoogle(url, source);
146174
}
175+
console.log(url);
147176
switch(source) {
148-
case 'nyt':
177+
case 'nytimes.com':
149178
const nytRes = await nyt(url);
150179
articleText = nytRes.articleText;
151180
articleHeadline = nytRes.articleHeadline;
152181
break;
153-
case 'nytcooking':
182+
case 'cooking.nytimes.com':
154183
const nytCookingRes = await nytCooking(url);
155184
articleText = nytCookingRes.articleText;
156185
articleHeadline = nytCookingRes.articleHeadline;
157186
break;
158-
case 'newyorker':
187+
case 'newyorker.com':
159188
const newyorkerRes = await newyorker(url);
160189
articleText = newyorkerRes.articleText;
161190
articleHeadline = newyorkerRes.articleHeadline;
162191
break;
163-
case 'economist':
192+
case 'economist.com':
164193
const economistRes = await economist(url);
165194
articleText = economistRes.articleText;
166195
articleHeadline = economistRes.articleHeadline;
167196
break;
168-
case 'washingtonpost':
197+
case 'washingtonpost.com':
169198
const washingtonPostRes = await washingtonPost(url);
170199
articleText = washingtonPostRes.articleText;
171200
articleHeadline = washingtonPostRes.articleHeadline;
172201
break;
173-
case 'latimes':
202+
case 'latimes.com':
174203
const laTimesRes = await latimes(url);
175204
articleText = laTimesRes.articleText;
176205
articleHeadline = laTimesRes.articleHeadline;
177206
break;
178-
case 'theathletic':
207+
case 'theathletic.com':
179208
const theAthleticRes = await theAthletic(url);
180209
articleText = theAthleticRes.articleText;
181210
articleHeadline = theAthleticRes.articleHeadline;
182211
break;
183-
case 'businessinsider':
212+
case 'businessinsider.com':
184213
const businessInsiderRes = await businessInsider(url);
185214
articleText = businessInsiderRes.articleText;
186215
articleHeadline = businessInsiderRes.articleHeadline;
187216
break;
188-
case 'vogue':
217+
case 'bloomberg.com':
218+
const bloombergRes = await bloomberg(url);
219+
articleText = bloombergRes.articleText;
220+
articleHeadline = bloombergRes.articleHeadline;
221+
break;
222+
case 'vogue.com':
189223
const vogueRes = await vogue(url);
190224
articleText = vogueRes.articleText;
191225
articleHeadline = vogueRes.articleHeadline;
192226
break;
193-
227+
default:
228+
articleText = 'No article found';
229+
articleHeadline = '404';
194230
}
195231
return {articleText, articleHeadline};
196232
}

views/article.handlebars

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,6 @@
22

33
<form action="/read" method="GET">
44
<input type="hidden" name="source" value="{{source}}" />
5-
<input class="form-control mb-2" type="text" name="url" placeholder="Type {{sourceText}} URL here" />
5+
<input class="form-control mb-2" type="text" name="url" placeholder="Type {{sourceText}} URL here" autofocus />
66
<button class="btn btn-success">Submit</button>
77
</form>

views/home.handlebars

Lines changed: 23 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,8 @@
11
<p class="lead">A web app that lets you read articles on popular news sites that get hidden behind paywalls.</p>
22

33
<div class="row">
4-
{{!-- <div class="col-sm-6 col-lg-4 mb-4">
5-
<div class="card">
6-
<div class="card-body">
7-
<h5 class="card-title">Card title</h5>
8-
<p class="card-text">This is another card with title and supporting text below. This card has some
9-
additional content to make it slightly taller overall.</p>
10-
<p class="card-text"><small class="text-muted">Last updated 3 mins ago</small></p>
11-
</div>
12-
</div>
13-
</div> --}}
144
<div class="col-sm-6 col-lg-4 mb-4">
15-
<a href="/article?source=nyt">
5+
<a href="/article?source=nytimes.com">
166
<div class="card p-5 text-white bg-primary rounded-3">
177
<figure class="mb-0">
188
<blockquote class="blockquote">
@@ -26,7 +16,7 @@
2616
</a>
2717
</div>
2818
<div class="col-sm-6 col-lg-4 mb-4">
29-
<a href="/article?source=nytcooking">
19+
<a href="/article?source=cooking.nytimes.com">
3020
<div class="card p-5 text-white bg-success rounded-3">
3121
<figure class="mb-0">
3222
<blockquote class="blockquote">
@@ -40,7 +30,7 @@
4030
</a>
4131
</div>
4232
<div class="col-sm-6 col-lg-4 mb-4">
43-
<a href="/article?source=newyorker">
33+
<a href="/article?source=newyorker.com">
4434
<div class="card p-5 text-white bg-info rounded-3">
4535
<figure class="mb-0">
4636
<blockquote class="blockquote">
@@ -54,7 +44,7 @@
5444
</a>
5545
</div>
5646
<div class="col-sm-6 col-lg-4 mb-4">
57-
<a href="/article?source=economist">
47+
<a href="/article?source=economist.com">
5848
<div class="card p-5 text-white bg-danger rounded-3">
5949
<figure class="mb-0">
6050
<blockquote class="blockquote">
@@ -68,7 +58,7 @@
6858
</a>
6959
</div>
7060
<div class="col-sm-6 col-lg-4 mb-4">
71-
<a href="/article?source=washingtonpost">
61+
<a href="/article?source=washingtonpost.com">
7262
<div class="card p-5 bg-warning border rounded-3">
7363
<figure class="mb-0">
7464
<blockquote class="blockquote text-dark">
@@ -82,7 +72,7 @@
8272
</a>
8373
</div>
8474
<div class="col-sm-6 col-lg-4 mb-4">
85-
<a href="/article?source=latimes">
75+
<a href="/article?source=latimes.com">
8676
<div class="card p-5 bg-secondary border rounded-3">
8777
<figure class="mb-0">
8878
<blockquote class="blockquote text-white">
@@ -96,7 +86,7 @@
9686
</a>
9787
</div>
9888
<div class="col-sm-6 col-lg-4 mb-4">
99-
<a href="/article?source=theathletic">
89+
<a href="/article?source=theathletic.com">
10090
<div class="card p-5 bg-dark border rounded-3">
10191
<figure class="mb-0">
10292
<blockquote class="blockquote text-white">
@@ -110,7 +100,7 @@
110100
</a>
111101
</div>
112102
<div class="col-sm-6 col-lg-4 mb-4">
113-
<a href="/article?source=businessinsider">
103+
<a href="/article?source=businessinsider.com">
114104
<div class="card p-5 bg-light border rounded-3">
115105
<figure class="mb-0">
116106
<blockquote class="blockquote text-dark">
@@ -124,8 +114,22 @@
124114
</a>
125115
</div>
126116
<div class="col-sm-6 col-lg-4 mb-4">
127-
<a href="/article?source=vogue">
117+
<a href="/article?source=bloomberg.com">
128118
<div class="card p-5 bg-primary border rounded-3">
119+
<figure class="mb-0">
120+
<blockquote class="blockquote text-white">
121+
<p>Bloomberg</p>
122+
</blockquote>
123+
<figcaption class="mb-0 text-white">
124+
bloomberg.com
125+
</figcaption>
126+
</figure>
127+
</div>
128+
</a>
129+
</div>
130+
<div class="col-sm-6 col-lg-4 mb-4">
131+
<a href="/article?source=vogue.com">
132+
<div class="card p-5 bg-success border rounded-3">
129133
<figure class="mb-0">
130134
<blockquote class="blockquote text-white">
131135
<p>Vogue</p>

views/read.handlebars

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
<h3 class="text-secondary">{{articleHeadline}} <span class="badge bg-dark source-badge">{{sourceText}}</span></h3>
1+
<h3 class="text-secondary">{{{articleHeadline}}} <span class="badge bg-dark source-badge">{{sourceText}}</span></h3>
22
<hr />
33
<div class="article-text-container">
44
{{{articleText}}}

0 commit comments

Comments
 (0)