-
Notifications
You must be signed in to change notification settings - Fork 1
/
publicsuffix.go
389 lines (313 loc) · 10.7 KB
/
publicsuffix.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
/*
Copyright 2018 GMO GlobalSign Ltd
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package publicsuffix provides functions to query the public suffix list found
// at:
//
// https://publicsuffix.org/
//
// When first initialised, this library uses a statically compiled list which
// may be out of date - callers should use Update to attempt to fetch a new
// version from the official GitHub repository. Alternate data sources (such as
// a network share, etc) can be used by implementing the ListRetriever
// interface.
//
// A list can be serialised using Write, and loaded using Read - this allows the
// caller to write the updated internal list to disk at shutdown and resume
// using it immediately on the next start.
//
// All exported functions are concurrency safe and the internal list uses
// copy-on-write during updates to avoid blocking queries.
package publicsuffix
import (
"bufio"
"bytes"
"compress/zlib"
"encoding/json"
"fmt"
"io"
"net/http"
"regexp"
"strings"
"sync"
"sync/atomic"
"golang.org/x/net/idna"
)
//go:generate go run gen.go
// rulesInfo is one complete, parsed copy of the public suffix list together
// with the release identifier it was built from. It is the value stored in the
// package-level rules variable, and it is what Write encodes as JSON and Read
// decodes.
type rulesInfo struct {
// Map indexes the rules by their name with every "." and any leading "*" or
// "!" marker removed (see newList). Several rules can share one key.
Map map[string][]rule
// Release identifies the version of the list. UpdateWithListRetriever
// compares it with the retriever's latest release tag to decide whether the
// list has to be reloaded.
Release string
}
// rule is a single entry of the public suffix list as stored in rulesInfo.Map.
type rule struct {
// DottedName is the list entry after whitespace trimming and conversion to
// ASCII by idna.ToASCII, including any leading "*." or "!" marker.
DottedName string
// RuleType tells whether the entry is a normal, wildcard or exception rule.
RuleType ruleType
// ICANN is true when the entry was read between the icannBegin and icannEnd
// markers of the source list.
ICANN bool
}
// subdomain is one candidate suffix of a queried domain, as produced by
// decomposeDomain.
type subdomain struct {
// name is dottedName with every "." removed; searchList uses it as the key
// into rulesInfo.Map.
name string
// dottedName is the candidate in its regular dotted form.
dottedName string
}
// ruleType enumerates the kinds of rule that newList recognises in the public
// suffix list.
//
// NOTE(review): rule.RuleType is part of the value that Write encodes as JSON,
// so these numeric values look like they are persisted; reordering the
// constants would presumably invalidate previously written lists - confirm
// before changing.
type ruleType int
const (
// normal is a rule without a leading marker.
normal ruleType = iota
// wildcard is a rule whose first character is "*", e.g. "*.ck".
wildcard
// exception is a rule whose first character is "!".
exception
)
// Section markers of the public suffix list source file. newList looks for
// them anywhere inside a line to decide whether the rules that follow belong
// to the ICANN section.
const (
	// icannBegin marks the start of the ICANN domains.
	icannBegin = "BEGIN ICANN DOMAINS"

	// icannEnd marks the end of the ICANN domains.
	icannEnd = "END ICANN DOMAINS"
)
var (
// validSuffixRE is used to check that the entries in the public suffix
// list are in canonical form (after Punycode encoding). Specifically,
// capital letters are not allowed: only lower-case ASCII letters, digits
// and the characters "_", "!", "*", "-" and "." are accepted.
validSuffixRE = regexp.MustCompile(`^[a-z0-9_\!\*\-\.]+$`)
// rules holds the current rulesInfo. Read and UpdateWithListRetriever
// replace it as a whole with Store and lookups read it with Load (see
// load), so a lookup always works on one complete list.
rules atomic.Value
// subdomainPool recycles the []subdomain scratch slices used by searchList
// to avoid reallocation cost on every lookup.
subdomainPool = sync.Pool{
New: func() interface{} {
// 5 should cover the average domain. The slice is created with length
// 5, but searchList truncates it to length 0 before use, so only its
// capacity matters.
return make([]subdomain, 5)
},
}
)
// init loads the list compiled into listBytes so that lookups work before the
// caller has had a chance to call Update or Read. It panics when the embedded
// data cannot be read.
func init() {
	err := Read(bytes.NewReader(listBytes))
	if err != nil {
		panic(fmt.Sprintf("error while initialising Public Suffix List from list.go: %s", err.Error()))
	}

	// The embedded bytes are only needed once; drop the reference so the
	// garbage collector can reclaim them.
	listBytes = nil
}
// load returns the rulesInfo currently stored in rules. It panics if rules
// holds no value or a value of another type.
func load() rulesInfo {
	current := rules.Load()
	return current.(rulesInfo)
}
// Write encodes a snapshot of the currently loaded public suffix list as JSON,
// compresses it with zlib and writes the result to w. The output can be loaded
// again with Read.
//
// The returned error is the first failure reported while encoding or while
// closing the compressor. Closing flushes the data still buffered by zlib, so
// an error at that stage means the data written to w is incomplete.
func Write(w io.Writer) error {
	zlibWriter := zlib.NewWriter(w)

	// Encode directly into the zlib writer, which in turn writes into w.
	if err := json.NewEncoder(zlibWriter).Encode(load()); err != nil {
		// The encoding error is the one worth reporting; the writer is still
		// closed so that it does not linger half-finished.
		_ = zlibWriter.Close()
		return err
	}

	// Close flushes the remaining compressed data and the checksum; its error
	// must reach the caller, otherwise a truncated stream looks like success.
	return zlibWriter.Close()
}
// Read decompresses and decodes a public suffix list previously serialised by
// Write and makes it the list used by all subsequent lookups.
//
// If r is not a valid zlib stream or does not contain the expected JSON, an
// error is returned and the list in use is left unchanged.
func Read(r io.Reader) error {
	decompressed, err := zlib.NewReader(r)
	if err != nil {
		return fmt.Errorf("zlib error: %s", err.Error())
	}
	defer decompressed.Close()

	// Decode into a fresh value first so that a failure cannot affect the
	// list that is currently in use.
	var decoded rulesInfo
	err = json.NewDecoder(decompressed).Decode(&decoded)
	if err != nil {
		return fmt.Errorf("json error: %s", err.Error())
	}

	rules.Store(decoded)
	return nil
}
// Update fetches the latest public suffix list from the official github
// repository and uses it for future lookups.
//
// https://github.com/publicsuffix/list
//
func Update() error {
return UpdateWithListRetriever(gitHubListRetriever{http.DefaultClient})
}
// UpdateWithListRetriever refreshes the internal public suffix list from
// listRetriever.
//
// It exists so that callers can plug in their own update source, for example a
// network store or a local cache, instead of fetching from the GitHub
// repository.
//
// The retriever is first asked for its latest release tag. If that tag equals
// the release already loaded, nothing else happens and nil is returned.
// Otherwise the list for that tag is retrieved, parsed and put in place of the
// current one. On any error the current list is kept.
func UpdateWithListRetriever(listRetriever ListRetriever) error {
	latestTag, err := listRetriever.GetLatestReleaseTag()
	if err != nil {
		return fmt.Errorf("error while retrieving last commit information: %s", err.Error())
	}

	// Already up to date: skip the download and the parsing.
	if latestTag == load().Release {
		return nil
	}

	var rawList io.Reader
	if rawList, err = listRetriever.GetList(latestTag); err != nil {
		return fmt.Errorf("error while retrieving Public Suffix List last release (%s): %s", latestTag, err.Error())
	}

	parsed, err := newList(rawList, latestTag)
	if err != nil {
		return err
	}

	rules.Store(*parsed)
	return nil
}
// HasPublicSuffix reports whether a rule of the public suffix list matches
// domain. It returns false when only the implicit "*" rule applies, and for a
// domain that is empty or ends in ".".
func HasPublicSuffix(domain string) bool {
	_, _, matched := searchList(domain)
	return matched
}
// PublicSuffix returns the public suffix of domain according to the currently
// loaded public suffix list.
//
// The returned bool is true when the public suffix is managed by the Internet
// Corporation for Assigned Names and Numbers. If false, the public suffix is
// privately managed. For example, foo.org and foo.co.uk are ICANN domains,
// foo.dyndns.org and foo.blogspot.co.uk are private domains.
//
// When no rule of the list matches, the last label of domain is returned
// together with false. A domain that is empty or ends in "." yields an empty
// suffix.
func PublicSuffix(domain string) (string, bool) {
	suffix, icann, _ := searchList(domain)
	return suffix, icann
}
// EffectiveTLDPlusOne returns the effective top level domain plus one more
// label. For example, the eTLD+1 for "foo.bar.golang.org" is "golang.org".
//
// An error is returned when no eTLD+1 can be derived:
//   - domain is empty or ends in ".", in which case no public suffix applies;
//   - domain is itself a public suffix, so there is no additional label;
//   - the label in front of the public suffix is empty, as in "foo..com" or
//     ".com";
//   - the public suffix does not start on a label boundary of domain.
func EffectiveTLDPlusOne(domain string) (string, error) {
	suffix, _ := PublicSuffix(domain)

	// PublicSuffix yields "" for an empty domain and for one ending in ".".
	// Without this check a trailing-dot domain such as "example.com." would
	// slip through the tests below and produce "com.".
	if suffix == "" || len(domain) <= len(suffix) {
		return "", fmt.Errorf("publicsuffix: cannot derive eTLD+1 for domain %q", domain)
	}

	// i is the index of the dot that separates the suffix from the rest.
	i := len(domain) - len(suffix) - 1
	if domain[i] != '.' {
		return "", fmt.Errorf("publicsuffix: invalid public suffix %q for domain %q", suffix, domain)
	}

	// start is the first byte of the label directly in front of the suffix.
	start := 1 + strings.LastIndex(domain[:i], ".")
	if start == i {
		// The extra label is empty, so the result would begin with a dot.
		return "", fmt.Errorf("publicsuffix: cannot derive eTLD+1 for domain %q", domain)
	}
	return domain[start:], nil
}
// Release returns the release identifier of the public suffix list that is
// currently in use.
func Release() string {
	current := load()
	return current.Release
}
// searchList looks for the given domain in the Public Suffix List and returns
// the suffix, a flag indicating if it's managed by the Internet Corporation
// (i.e. the matching rule lies in the ICANN section of the list), and a flag
// indicating if it was found in the list.
//
// Candidate suffixes are tried from the full domain down to its last label and
// the first rule that matches decides the result. When no rule matches, the
// last label of domain is returned and both flags are false. A domain that is
// empty or ends in "." yields ("", false, false).
func searchList(domain string) (string, bool, bool) {
// If the domain ends on a dot the subdomains can't be obtained - no PSL applicable.
// The same comparison is also true for the empty string (-1 == -1).
if strings.LastIndex(domain, ".") == len(domain)-1 {
return "", false, false
}
// Borrow a scratch slice from the pool, emptied but with its capacity kept.
var buffer = subdomainPool.Get().([]subdomain)[:0]
var subdomains = decomposeDomain(domain, buffer)
// Hand back the slice returned by decomposeDomain, which may have grown.
defer subdomainPool.Put(subdomains)
// Take one snapshot of the list and use it for the whole lookup.
var rulesInfo = load()
// the longest matching rule (the one with the most levels) will be used
for _, sub := range subdomains {
// The map is keyed by names without dots.
var rules, found = rulesInfo.Map[sub.name]
if !found {
continue
}
// Look for all the rules matching the concatenated name. Different dotted
// names can collapse to the same dot-less key, so every rule is checked
// again against the dotted form of the candidate.
for _, rule := range rules {
switch rule.RuleType {
case wildcard:
// first check if the rule is contained within the domain without the *.
if !strings.HasSuffix(sub.dottedName, rule.DottedName[2:]) {
continue
}
if len(domain) < len(rule.DottedName) {
// Handle corner case where the domain doesn't have a left side and a wildcard rule matches,
// i.e ".ck" with rule "*.ck" must return .ck as per golang implementation
if domain[0] == '.' && strings.Compare(domain, rule.DottedName[1:]) == 0 {
return domain, rule.ICANN, true
}
// The domain is too short to supply a label for the "*".
continue
}
// The suffix covers as many labels as the rule has, the "*" included:
// step back that many dots from the end of domain and keep what follows.
// If domain runs out of dots first, dot ends at -1 and the whole domain
// is returned.
var nbLevels = strings.Count(rule.DottedName, ".") + 1
var dot = len(domain) - 1
for i := 0; i < nbLevels && dot != -1; i++ {
dot = strings.LastIndex(domain[:dot], ".")
}
return domain[dot+1:], rule.ICANN, true
case exception:
// first check if the rule is contained within the domain without !
if !strings.HasSuffix(sub.dottedName, rule.DottedName[1:]) {
continue
}
// The suffix of an exception rule is the rule without its first label
// (which also drops the "!").
var dot = strings.Index(rule.DottedName, ".")
return rule.DottedName[dot+1:], rule.ICANN, true
default:
// first check if the rule is contained within the domain
if !strings.HasSuffix(sub.dottedName, rule.DottedName) {
continue
}
// A normal rule is the public suffix itself.
return rule.DottedName, rule.ICANN, true
}
}
}
// If no rules match, the prevailing rule is "*": the suffix is the last label.
var dot = strings.LastIndex(domain, ".")
return domain[dot+1:], false, false
}
// newList parses the public suffix list read from r and returns a new
// rulesInfo identified by release.
//
// Blank lines and lines starting with "//" are skipped. Every other line is
// converted to ASCII with idna.ToASCII, validated against validSuffixRE and
// stored under a key made of the rule with its dots and any leading "*" or "!"
// removed. Rules read between the icannBegin and icannEnd markers are flagged
// as ICANN rules.
//
// An error, and no list, is returned when a line cannot be converted or
// validated, or when reading from r fails. A line too long for the scanner
// also counts as a read failure.
func newList(r io.Reader, release string) (*rulesInfo, error) {
	var (
		icann    bool
		rulesMap = make(map[string][]rule)
		scanner  = bufio.NewScanner(r)
	)

	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())

		// The section markers live inside comment lines, so they have to be
		// looked for before comments are discarded.
		switch {
		case strings.Contains(line, icannBegin):
			icann = true
			continue
		case strings.Contains(line, icannEnd):
			icann = false
			continue
		case line == "" || strings.HasPrefix(line, "//"):
			continue
		}

		// Keep the original line untouched so that the error message shows
		// the offending input rather than a partial conversion result.
		ascii, err := idna.ToASCII(line)
		if err != nil {
			return nil, fmt.Errorf("error while converting to ASCII %s: %s", line, err.Error())
		}
		if !validSuffixRE.MatchString(ascii) {
			return nil, fmt.Errorf("bad publicsuffix.org list data: %q", ascii)
		}

		entry := rule{ICANN: icann, DottedName: ascii, RuleType: normal}

		// The map key is the rule without dots and without its marker.
		key := strings.Replace(ascii, ".", "", -1)
		switch {
		case strings.HasPrefix(key, "*"):
			entry.RuleType = wildcard
			key = key[1:]
		case strings.HasPrefix(key, "!"):
			entry.RuleType = exception
			key = key[1:]
		}
		rulesMap[key] = append(rulesMap[key], entry)
	}

	// Scan also returns false on a read error or an over-long line; without
	// this check a truncated list would be accepted as complete.
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("error while reading Public Suffix List: %s", err.Error())
	}

	return &rulesInfo{Release: release, Map: rulesMap}, nil
}
// decomposeDomain appends to subdomains the candidate suffixes of domain, from
// longest to shortest: domain itself, then what remains after removing the
// leftmost label, and so on down to the last label. For "a.b.c" these are
// "a.b.c", "b.c" and "c".
//
// Each entry carries the dotted form and the same text with all dots removed.
// The extended slice is returned; it may use a different backing array than
// the one passed in.
func decomposeDomain(domain string, subdomains []subdomain) []subdomain {
	rest := domain
	for {
		subdomains = append(subdomains, subdomain{
			name:       strings.Replace(rest, ".", "", -1),
			dottedName: rest,
		})

		dot := strings.Index(rest, ".")
		if dot < 0 {
			return subdomains
		}
		// Drop the leftmost label together with its dot.
		rest = rest[dot+1:]
	}
}