Skip to content

Commit 981cac1

Browse files
committed
fix: remove www from hostname; strip &nbsp;
1 parent 1b4539a commit 981cac1

File tree

5 files changed

+43
-3
lines changed

5 files changed

+43
-3
lines changed

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "recipe-scrapers-js",
3-
"version": "0.1.0-alpha.3",
3+
"version": "0.1.0-alpha.4",
44
"license": "MIT",
55
"description": "A recipe scrapers library",
66
"author": {

src/plugins/__tests__/html-stripper.processor.test.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ describe('HtmlStripperPlugin', () => {
2020
expect(plugin.process('title', '<span>Test &lt;tag&gt;</span>')).toBe(
2121
'Test <tag>',
2222
)
23+
expect(plugin.process('description', '<span>Hello&nbsp;World</span>')).toBe(
24+
'Hello World',
25+
)
2326
})
2427

2528
it('strips HTML from instructions Set<string>', () => {

src/plugins/html-stripper.processor.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ export class HtmlStripperPlugin extends PostProcessorPlugin {
6969
return html
7070
.replace(/<[^>]*>/g, '') // Remove HTML tags
7171
.replace(/&amp;/g, '&') // Decode common entities
72+
.replace(/&nbsp;/g, ' ')
7273
.replace(/&lt;/g, '<')
7374
.replace(/&gt;/g, '>')
7475
.replace(/&quot;/g, '"')

src/utils/__tests__/index.test.ts

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import { describe, expect, it } from 'bun:test'
22
import {
3+
getHostName,
34
isDefined,
45
isFunction,
56
isNumber,
@@ -92,3 +93,36 @@ describe('isString', () => {
9293
expect(isString({})).toBe(false)
9394
})
9495
})
96+
describe('getHostName', () => {
97+
it('should return the host for a standard URL', () => {
98+
expect(getHostName('https://www.example.com/path')).toBe('example.com')
99+
})
100+
101+
it('should return the host for a URL with a subdomain', () => {
102+
expect(getHostName('http://sub.domain.co.uk/page?q=1')).toBe(
103+
'sub.domain.co.uk',
104+
)
105+
})
106+
107+
it('should return the host for a URL without a path', () => {
108+
expect(getHostName('https://anothersite.org')).toBe('anothersite.org')
109+
})
110+
111+
it('should throw an error for an invalid URL string', () => {
112+
const invalidUrl = 'not a url'
113+
expect(() => getHostName(invalidUrl)).toThrow(
114+
new Error(`Invalid URL: ${invalidUrl}`),
115+
)
116+
})
117+
118+
it('should throw an error for an empty string', () => {
119+
expect(() => getHostName('')).toThrow(new Error('Invalid URL: '))
120+
})
121+
122+
it('should throw an error for a string that looks like a host but lacks a protocol', () => {
123+
const urlWithoutProtocol = 'example.com'
124+
expect(() => getHostName(urlWithoutProtocol)).toThrow(
125+
new Error(`Invalid URL: ${urlWithoutProtocol}`),
126+
)
127+
})
128+
})

src/utils/index.ts

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,13 @@ export function isString(value: unknown): value is string {
2626
}
2727

2828
/**
29-
* Extracts the host name from a URL string.
29+
* Extracts the host name from a URL string
30+
* and removes the 'www.' prefix if present.
31+
* Throws an error if the input is not a valid URL.
3032
*/
3133
export function getHostName(value: string) {
3234
try {
33-
const url = new URL(value)
35+
const url = new URL(value.replace('www.', ''))
3436
return url.host
3537
} catch {
3638
throw new Error(`Invalid URL: ${value}`)

0 commit comments

Comments
 (0)