-
Notifications
You must be signed in to change notification settings - Fork 146
/
Copy pathspider.go
109 lines (102 loc) · 3.99 KB
/
spider.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
// A small Go web spider: it fetches a page and prints the links found in it.
package main
import (
"encoding/xml"
"fmt"
"io/ioutil"
"math/rand"
"net/http"
"regexp"
"runtime"
"strings"
"time"
)
// urlChannel carries the href values that Spy extracts to the receive loop in main.
var urlChannel = make(chan string, 200) // buffered: up to 200 hrefs can queue before a send blocks
// atagRegExp matches a complete anchor element (<a ... href="...">text</a>)
// whose href attribute value is quoted and whose content holds no nested tag.
//
// The pattern is case-insensitive, so <A>, HREF and Href are all accepted.
// The earlier pattern wrote its alternatives inside square brackets, which
// regexp treats as a character class; it therefore accepted any attribute
// whose name merely ended in one of the letters h, r, e or f (title=, name=).
// Requiring whitespace directly before "href" also keeps attributes such as
// data-href from matching.
//
// regexp.MustCompile panics on an invalid pattern, which surfaces a broken
// pattern at program start rather than at first use.
var atagRegExp = regexp.MustCompile(`(?i)<a\s+(?:[^>]*?\s)?href\s*=\s*(?:"[^"]*"|'[^']*')[^>]*>[^<]*</a>`)
// userAgent is the fixed pool of User-Agent header values from which
// GetRandomUserAgent picks one per request.
//
// NOTE(review): the entry at index 3 has no closing parenthesis, and the
// entries use "," where browser User-Agent strings normally use ";" — confirm
// whether that is intentional before relying on these to mimic real browsers.
var userAgent = [...]string{
	"Mozilla/5.0 (compatible, MSIE 10.0, Windows NT, DigExt)",
	"Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, 360SE)",
	"Mozilla/4.0 (compatible, MSIE 8.0, Windows NT 6.0, Trident/4.0)",
	"Mozilla/5.0 (compatible, MSIE 9.0, Windows NT 6.1, Trident/5.0,",
	"Opera/9.80 (Windows NT 6.1, U, en) Presto/2.8.131 Version/11.11",
	"Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, TencentTraveler 4.0)",
	"Mozilla/5.0 (Windows, U, Windows NT 6.1, en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
	"Mozilla/5.0 (Macintosh, Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
	"Mozilla/5.0 (Macintosh, U, Intel Mac OS X 10_6_8, en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
	"Mozilla/5.0 (Linux, U, Android 3.0, en-us, Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
	"Mozilla/5.0 (iPad, U, CPU OS 4_3_3 like Mac OS X, en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
	"Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, Trident/4.0, SE 2.X MetaSr 1.0, SE 2.X MetaSr 1.0, .NET CLR 2.0.50727, SE 2.X MetaSr 1.0)",
	"Mozilla/5.0 (iPhone, U, CPU iPhone OS 4_3_3 like Mac OS X, en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
	"MQQBrowser/26 Mozilla/5.0 (Linux, U, Android 2.3.7, zh-cn, MB200 Build/GRJ22, CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
}
// r is the package-level pseudo-random source, seeded once from the current
// time in nanoseconds; GetRandomUserAgent draws its index from it.
// NOTE(review): per the math/rand docs a *rand.Rand created with rand.New is
// not safe for concurrent use — confirm it is only used by one goroutine at a time.
var r = rand.New(rand.NewSource(time.Now().UnixNano()))
func main() {
//go Spy("https://www.baidu.com")
go Spy("https://blog.csdn.net/guyan0319/article/details/90450958")
for url := range urlChannel {
fmt.Println("routines num = ", runtime.NumGoroutine(), "chan len = ", len(urlChannel)) //通过runtime可以获取当前运行时的一些相关参数等
//go Spy(url)
fmt.Println(url)
}
fmt.Println("a")
}
// Spy fetches url with a randomly chosen User-Agent header, prints the
// response body, and sends the href of every anchor matched by atagRegExp to
// urlChannel.
//
// Hrefs containing "article/details/" are printed with a "☆" prefix, all
// others with "□"; each href is then printed once more on its own line.
// Request failures are printed with an "[E]" prefix and end the call without
// sending anything. A panic raised inside Spy is recovered and printed the
// same way. Responses whose status is not 200 are skipped silently.
func Spy(url string) {
	defer func() {
		if rec := recover(); rec != nil {
			fmt.Println("[E]", rec)
		}
	}()

	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		// A malformed URL yields a nil request; report it instead of
		// dereferencing nil on the next line.
		fmt.Println("[E]", err)
		return
	}
	req.Header.Set("User-Agent", GetRandomUserAgent())

	res, err := http.DefaultClient.Do(req)
	if err != nil {
		// fmt.Errorf only builds an error value; it has to be printed to be seen.
		fmt.Println("[E]", fmt.Errorf("Get请求%s返回错误:%s", url, err))
		return
	}
	// Close the body on every path, including non-200 responses, so the
	// underlying connection is released.
	defer res.Body.Close()

	if res.StatusCode != http.StatusOK {
		return
	}

	bodyByte, err := ioutil.ReadAll(res.Body)
	if err != nil {
		// Report the failure but still scan whatever was read, keeping the
		// best-effort behavior for truncated responses.
		fmt.Println("[E]", err)
	}
	resStr := string(bodyByte)
	fmt.Println(resStr)

	for _, a := range atagRegExp.FindAllString(resStr, -1) {
		href, _ := GetHref(a)
		if strings.Contains(href, "article/details/") {
			fmt.Println("☆", href)
		} else {
			fmt.Println("□", href)
		}
		fmt.Println(href)
		urlChannel <- href
	}
}
func GetRandomUserAgent() string {
return userAgent[r.Intn(len(userAgent))]
}
// GetHref parses a single anchor element and returns the value of its href
// attribute (matched case-insensitively) together with its text content.
//
// The decoder uses the lenient HTML configuration described in the
// encoding/xml documentation (Strict=false, HTMLAutoClose, HTMLEntity). In
// strict mode a bare "&" inside a query string (href="/p?a=1&b=2") or an HTML
// entity such as &nbsp; is a syntax error, which made the whole anchor yield
// an empty href.
//
// Parsing stops at the first error returned by the decoder (io.EOF included)
// and whatever was collected so far is returned; no error is reported to the
// caller. If several href attributes or several text nodes are present, the
// last one wins.
func GetHref(atag string) (href, content string) {
	decoder := xml.NewDecoder(strings.NewReader(atag))
	decoder.Strict = false
	decoder.AutoClose = xml.HTMLAutoClose
	decoder.Entity = xml.HTMLEntity

	for {
		t, err := decoder.Token()
		if err != nil {
			break
		}
		switch token := t.(type) {
		case xml.StartElement:
			// Opening tag: look for the href attribute.
			for _, attr := range token.Attr {
				// EqualFold already ignores case, so one comparison covers href/HREF/Href.
				if strings.EqualFold(attr.Name.Local, "href") {
					href = attr.Value
				}
			}
		case xml.EndElement:
			// Closing tag: nothing to collect.
		case xml.CharData:
			// Character data: the element's text.
			content = string(token)
		default:
			// Any other token kind (comment, directive, processing
			// instruction) resets both results.
			href, content = "", ""
		}
	}
	return href, content
}