|
| 1 | +# !/usr/bin/env python |
| 2 | +# -*- coding:utf-8 -*- |
| 3 | +__author__ = 'bit4' |
| 4 | +__github__ = 'https://github.com/bit4woo' |
| 5 | + |
| 6 | +from lib import myparser |
| 7 | +from lib.log import logger |
| 8 | +import time |
| 9 | +from lib import myrequests |
| 10 | +req = myrequests |
| 11 | + |
class search_sogou:
    """Collect hostnames and e-mail addresses for a keyword via Sogou web search.

    Result pages are fetched one at a time with ``req.get``; their raw bodies
    are accumulated in ``totalresults`` and handed to ``myparser.parser`` to
    extract hostnames and e-mail addresses.
    """

    # Amount the result offset (``counter``) advances after each fetched page.
    PAGE_STEP = 10
    # Hard ceiling on the result offset, regardless of the requested limit.
    MAX_OFFSET = 1000

    def __init__(self, word, limit, proxy=None):
        """Store the search settings and log the start banner.

        :param word: keyword to search for (also passed to the parser).
        :param limit: highest result offset to request; anything ``int()`` accepts.
        :param proxy: optional proxies value forwarded to ``req.get``.
        """
        self.engine_name = "SoGou"
        self.word = word
        self.limit = int(limit)
        self.results = ""        # body of the most recently fetched page
        self.totalresults = ""   # concatenation of every fetched page body
        self.proxies = proxy
        self.server = "www.sogou.com"
        self.counter = 0         # result offset; grows by PAGE_STEP per page
        self.print_banner()

    def print_banner(self):
        """Log that this engine is starting its search."""
        logger.info("Searching now in {0}..".format(self.engine_name))

    def _build_url(self):
        """Return the search URL for the page implied by the current offset.

        The ``page`` query parameter is a page number (the sample URL is
        ``http://www.sogou.com/web?query=xxxx&page=2&ie=utf8``), not a result
        offset, so the offset is converted to a 1-based page number here.
        """
        page = self.counter // self.PAGE_STEP + 1
        # NOTE(review): ``word`` is inserted unescaped; this assumes callers
        # pass a plain domain-like keyword -- confirm against callers.
        return "http://{0}/web?query={1}&page={2}".format(
            self.server, self.word, page)

    def do_search(self):
        """Fetch one result page and append its body to ``totalresults``.

        :return: ``True`` if the page was fetched and stored, ``False`` if
            anything raised (the error is logged, not propagated).
        """
        try:
            r = req.get(self._build_url(), proxies=self.proxies)
            self.results = r.content
            self.totalresults += self.results
            return True
        except Exception as e:
            # Best effort: a failed page ends the crawl instead of crashing.
            logger.error("Error in {0}: {1}".format(__file__.split('/')[-1], e))
            return False

    def process(self):
        """Fetch pages until the limit or MAX_OFFSET is passed, or a fetch fails."""
        while self.counter <= self.limit and self.counter <= self.MAX_OFFSET:
            if not self.do_search():
                break
            # Pause between successive requests (presumably to avoid
            # hammering the search engine).
            time.sleep(1)
            self.counter += self.PAGE_STEP

    def get_emails(self):
        """Return the e-mail addresses the parser finds in the fetched pages."""
        rawres = myparser.parser(self.totalresults, self.word)
        return rawres.emails()

    def get_hostnames(self):
        """Return the hostnames the parser finds in the fetched pages."""
        rawres = myparser.parser(self.totalresults, self.word)
        return rawres.hostnames()

    def run(self):
        """Run the whole search; usable as a thread entry point.

        Stores the hostnames in ``self.d`` and the e-mails in ``self.e``.

        :return: tuple ``(hostnames, emails)``.
        """
        self.process()
        self.d = self.get_hostnames()
        self.e = self.get_emails()
        logger.info("{0} found {1} domain(s) and {2} email(s)".format(
            self.engine_name, len(self.d), len(self.e)))
        return self.d, self.e
| 67 | + |
| 68 | + |
if __name__ == "__main__":
    # Manual smoke test: search Sogou for a sample domain and print the
    # (hostnames, emails) tuple returned by run().
    # To route the requests through a local intercepting proxy, pass e.g.
    # proxy={"http": "http://127.0.0.1:8080"} as the third argument.
    search = search_sogou("meizu.com", '100')
    # Parenthesised form prints the same tuple on Python 2 and Python 3.
    print(search.run())
0 commit comments