Python 3 crawler that collects result links from 360 (so.com) web search.
#!/usr/bin/python
# coding:utf8
import re
import requests
import sys
import getopt
from bs4 import BeautifulSoup
from urllib.parse import quote
from time import sleep
import time
import random
class crawler:
    """Crawler that collects result links from 360 web search (so.com).

    Fetches successive result pages for a keyword, extracts each result's
    target URL into a de-duplicated set, and prints them when finished.
    """

    # HTTP headers sent with every request so the crawler is treated as a
    # regular desktop browser rather than a script.
    headersParameters = {
        'Connection': 'Keep-Alive',
        'Accept': 'text/html, application/xhtml+xml, */*',
        'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'User-Agent': 'Mozilla/6.1 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
    }

    def __init__(self, keyword):
        """Build the first search-results URL for *keyword*.

        All mutable state is per-instance here; the original kept it at
        class level (e.g. ``urls = set()``), which would be shared by every
        instance of the class.
        """
        print("搜索关键词:" + keyword)
        print("正在获取网页链接中......")
        self.url = u'https://www.so.com/s?ie=utf-8&fr=360portal&src=home_www&q=' + quote(keyword)
        self.urls = set()         # de-duplicated result links collected so far
        self.html = ''            # body of the most recently fetched page
        self.total_pages = 5      # how many result pages to crawl
        self.current_page = 0     # number of pages fetched so far
        self.next_page_url = ''   # URL of the next results page ('' = none)
        self.timeout = 60         # per-request timeout, in seconds
        self.p1 = 0               # progress step: percent per page
        self.i1 = 1               # number of progress steps printed so far

    def set_timeout(self, time):
        """Set the per-request timeout in seconds; invalid input is ignored."""
        try:
            self.timeout = int(time)
        except (TypeError, ValueError):
            pass

    def set_total_pages(self, num):
        """Set how many result pages to crawl; invalid input is ignored."""
        try:
            self.total_pages = int(num)
            # ZeroDivisionError (num == 0) is swallowed like the other
            # bad inputs, leaving p1 unchanged — same as the original.
            self.p1 = int(100 / self.total_pages)
        except (TypeError, ValueError, ZeroDivisionError):
            pass

    def set_current_url(self, url):
        """Point the crawler at *url* for the next fetch."""
        self.url = url

    def switch_url(self):
        """Advance to the next results page, or exit when there is none."""
        if self.next_page_url == '':
            sys.exit()
        else:
            self.set_current_url(self.next_page_url)

    def is_finish(self):
        """Return True once the requested number of pages has been fetched."""
        return self.current_page >= self.total_pages

    def get_html(self):
        """Fetch ``self.url`` into ``self.html``, retrying transient failures.

        Retries forever with a 5-second pause on connection errors or
        timeouts; on a non-200 response ``self.html`` is cleared.
        """
        success = False
        while not success:
            try:
                r = requests.get(self.url, timeout=self.timeout,
                                 headers=self.headersParameters)
            # Bug fix: also retry on Timeout — the original caught only
            # ConnectionError, so a slow server crashed the crawl even
            # though a timeout was configured.
            except (requests.exceptions.ConnectionError,
                    requests.exceptions.Timeout):
                sleep(5)
            else:
                success = True
        if r.status_code == 200:
            self.html = r.text
            self.current_page += 1
        else:
            self.html = u''
            print('[ERROR]', self.url, u'get此url返回的http状态码不是200')

    def get_urls(self):
        """Parse ``self.html``: collect result links and the next-page URL."""
        bsObj = BeautifulSoup(self.html, "html.parser")
        # Each search hit is an <h3 class="res-title "> whose first <a>
        # carries the target either in data-url or in a plain href.
        for h3 in bsObj.find_all("h3", "res-title "):
            if "data-url" in h3.a.attrs:
                self.urls.add(h3.a.attrs["data-url"])
            else:
                self.urls.add(h3.a.attrs["href"])
        # The last relative "/s?q=..." link on the page is the next-page link.
        next_links = re.findall(r' href\=\"(\/s\?q\=[\w\d\%\&\=\_\-]*?)\"', self.html)
        if next_links:
            self.next_page_url = 'https://www.so.com' + next_links[-1]
        else:
            self.next_page_url = ''

    def print_urls(self):
        """Print every collected result URL, one per line."""
        for url in self.urls:
            print(url)

    def run(self):
        """Crawl pages until done, printing progress, then dump the URLs.

        Bug fix: the original body called every method on the module-level
        global ``c`` instead of ``self``, so ``run`` only worked for that
        one specific instance created in the __main__ block.
        """
        while not self.is_finish():
            self.get_html()
            self.get_urls()
            self.switch_url()
            print(str(self.p1 * self.i1) + " %")
            if not self.is_finish():
                # Random delay between pages to avoid being rate-limited.
                time.sleep(random.randint(6, 20))
                self.i1 += 1
        if self.p1 * self.i1 < 100:
            # The per-page step may not reach exactly 100 (e.g. 3 pages
            # gives 33/66/99), so finish the progress display explicitly.
            print("100 %")
        self.print_urls()
        print("完毕......")
if __name__ == '__main__':
    usage = '360_crawler.py -k <keyword> [-t <timeout> -p <total pages>]'
    keyword = None
    timeout = None
    totalpages = None
    try:
        # Bug fix: the option loop below accepts --keyword/--timeout/
        # --totalpages, but the original never declared any long options,
        # so getopt rejected them with GetoptError.
        opts, args = getopt.getopt(sys.argv[1:], "hk:t:p:",
                                   ["keyword=", "timeout=", "totalpages="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    # Parse command-line options into keyword / timeout / totalpages.
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-k", "--keyword"):
            keyword = arg
        elif opt in ("-t", "--timeout"):
            timeout = arg
        elif opt in ("-p", "--totalpages"):
            totalpages = arg
    # A keyword is mandatory; everything else has defaults in the crawler.
    if keyword is None:
        print(usage)
        sys.exit()
    c = crawler(keyword)
    if timeout is not None:
        print('网站连接超时时间:' + timeout + '秒')
        c.set_timeout(timeout)
    if totalpages is not None:
        print('获取' + totalpages + '个搜索结果页面')
        c.set_total_pages(totalpages)
    print("0 %")
    c.run()