Skip to content

Commit c4a61cf

Browse files
committed
Supported: howie6879#5
1 parent f3a7434 commit c4a61cf

File tree

3 files changed

+50
-24
lines changed

3 files changed

+50
-24
lines changed

Examples/google_search.py

+32-13
Original file line number | Diff line number | Diff line change
@@ -15,24 +15,35 @@
1515
#################################################
1616

1717
PROXIES = [{
18-
'http': 'http://192.168.2.207:1080',
19-
'https': 'http://192.168.2.207:1080'
18+
'http': 'http://127.0.0.1:8118',
19+
'https': 'http://127.0.0.1:8118'
2020
}]
2121

2222
# Or MagicGoogle()
2323
mg = MagicGoogle(PROXIES)
2424

2525
# The first page of results
26-
result = mg.search_page(query='python')
27-
print(result)
26+
# result = mg.search_page(query='python')
27+
# print(result)
28+
#
29+
# time.sleep(random.randint(1, 5))
2830

29-
time.sleep(random.randint(1, 3))
31+
# Get {'title','url','text'}
32+
for i in mg.search(query='python', num=1, language='en'):
33+
pprint.pprint(i)
34+
35+
time.sleep(random.randint(1, 5))
36+
37+
# Output
38+
# {'text': 'The official home of the Python Programming Language.',
39+
# 'title': 'Welcome to Python .org',
40+
# 'url': 'https://www.python.org/'}
3041

31-
# Get url
42+
# Get first page
3243
for url in mg.search_url(query='python'):
3344
pprint.pprint(url)
3445

35-
time.sleep(random.randint(1, 3))
46+
time.sleep(random.randint(1, 5))
3647

3748
# Output
3849
# 'https://www.python.org/'
@@ -47,10 +58,18 @@
4758
# 'https://learnpythonthehardway.org/book/'
4859
# 'https://www.continuum.io/downloads'
4960

50-
# Get {'title','url','text'}
51-
for i in mg.search(query='python', num=1):
52-
pprint.pprint(i)
61+
# Get second page
62+
for url in mg.search_url(query='python', start=10):
63+
pprint.pprint(url)
64+
5365
# Output
54-
# {'text': 'The official home of the Python Programming Language.',
55-
# 'title': 'Welcome to Python .org',
56-
# 'url': 'https://www.python.org/'}
66+
# 'https://github.com/python'
67+
# 'https://github.com/python/cpython'
68+
# 'https://www.learnpython.org/'
69+
# 'https://www.raspberrypi.org/documentation/usage/python/'
70+
# 'https://www.reddit.com/r/Python/'
71+
# 'https://www.datacamp.com/courses/intro-to-python-for-data-science'
72+
# 'https://www.coursera.org/learn/python'
73+
# 'https://www.coursera.org/learn/interactive-python-1'
74+
# 'http://abcnews.go.com/US/record-breaking-17-foot-python-captured-south-florida/story?id=51616851'
75+
# 'https://hub.docker.com/_/python/'

MagicGoogle/magic_google.py

+17-10
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@
77
import requests
88

99
from pyquery import PyQuery as pq
10-
from MagicGoogle.config import USER_AGENT, DOMAIN, BLACK_DOMAIN, URL_SEARCH, URL_NUM, LOGGER
10+
from MagicGoogle.config import USER_AGENT, DOMAIN, BLACK_DOMAIN, URL_SEARCH, URL_NEXT, URL_NUM, LOGGER
1111

1212
if sys.version_info[0] > 2:
1313
from urllib.parse import quote_plus, urlparse, parse_qs
@@ -24,7 +24,7 @@ class MagicGoogle():
2424
def __init__(self, proxies=None):
2525
self.proxies = random.choice(proxies) if proxies else None
2626

27-
def search(self, query, language='en', num=None, start=0, pause=2):
27+
def search(self, query, language=None, num=None, start=0, pause=2):
2828
"""
2929
Get the results you want,such as title,description,url
3030
:param query:
@@ -46,7 +46,7 @@ def search(self, query, language='en', num=None, start=0, pause=2):
4646
result['text'] = text
4747
yield result
4848

49-
def search_page(self, query, language='en', num=None, start=0, pause=2):
49+
def search_page(self, query, language=None, num=None, start=0, pause=2):
5050
"""
5151
Google search
5252
:param query: Keyword
@@ -55,14 +55,21 @@ def search_page(self, query, language='en', num=None, start=0, pause=2):
5555
"""
5656
time.sleep(pause)
5757
domain = self.get_random_domain()
58-
if num is None:
59-
url = URL_SEARCH
58+
if start > 0:
59+
url = URL_NEXT
6060
url = url.format(
61-
domain=domain, language=language, query=quote_plus(query))
61+
domain=domain, language=language, query=quote_plus(query), num=num, start=start)
6262
else:
63-
url = URL_NUM
64-
url = url.format(
65-
domain=domain, language=language, query=quote_plus(query), num=num)
63+
if num is None:
64+
url = URL_SEARCH
65+
url = url.format(
66+
domain=domain, language=language, query=quote_plus(query))
67+
else:
68+
url = URL_NUM
69+
url = url.format(
70+
domain=domain, language=language, query=quote_plus(query), num=num)
71+
if language is None:
72+
url = url.replace('hl=None&', '')
6673
# Add headers
6774
headers = {'user-agent': self.get_random_user_agent()}
6875
try:
@@ -82,7 +89,7 @@ def search_page(self, query, language='en', num=None, start=0, pause=2):
8289
LOGGER.exception(e)
8390
return None
8491

85-
def search_url(self, query, language='en', num=None, start=0, pause=2):
92+
def search_url(self, query, language=None, num=None, start=0, pause=2):
8693
"""
8794
:param query:
8895
:param language:

setup.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55
name='MagicGoogle',
66
version='0.2.7',
77
description="A google search results crawler",
8-
install_requires=['pyquery>=1.2.17', 'requests>=2.12.4', 'chardet>=2.3.0'],
8+
install_requires=['pyquery>=1.2.17', 'requests>=2.12.4', 'cchardet'],
99
author='Howie Hu',
1010
author_email='[email protected]',
1111
url="https://github.com/howie6879/MagicGoogle/blob/master/README.md",

0 commit comments

Comments
 (0)