#!/usr/bin/env python3
# coding: utf-8
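"""Crawl result URLs for a keyword from 360 search (www.so.com)."""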

import re
import requests
import sys
import getopt
from bs4 import BeautifulSoup
from urllib.parse import quote
import time
import random

class crawler:
    '''Crawler that scrapes 360 search result pages.'''

    headersParameters = {    # HTTP headers sent with each request, to masquerade as a browser
        'Connection': 'Keep-Alive',
        'Accept': 'text/html, application/xhtml+xml, */*',
        'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'User-Agent': 'Mozilla/6.1 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
    }

    def __init__(self, keyword):
        print("Search keyword: " + keyword)
        print("Fetching result links......")
        self.url = 'https://www.so.com/s?ie=utf-8&fr=360portal&src=home_www&q=' + quote(keyword)
        self.urls = set()        # collected result URLs (a set, for deduplication)
        self.html = ''
        self.total_pages = 5     # number of result pages to crawl
        self.current_page = 0
        self.next_page_url = ''
        self.timeout = 60        # request timeout in seconds
        self.p1 = int(100 / self.total_pages)  # progress percent per page
        self.i1 = 1              # progress counter (pages fetched so far)

    def set_timeout(self, seconds):
        '''Set the request timeout, in seconds.'''
        try:
            self.timeout = int(seconds)
        except ValueError:
            pass

    def set_total_pages(self, num):
        '''Set the total number of result pages to crawl.'''
        try:
            self.total_pages = int(num)
            self.p1 = int(100 / self.total_pages)  # progress percent per page
        except ValueError:
            pass

    def set_current_url(self, url):
        '''Set the current URL.'''
        self.url = url

    def switch_url(self):
        '''Switch the current URL to the next page's URL;
           if there is no next page, print what was collected and exit.'''
        if self.next_page_url == '':
            self.print_urls()  # don't lose the collected URLs on early exit
            sys.exit()
        else:
            self.set_current_url(self.next_page_url)

    def is_finish(self):
        '''Return True once the requested number of pages has been crawled.'''
        return self.current_page >= self.total_pages

    def get_html(self):
        '''Fetch the page at the current URL and store its content in html.'''
        # Send the request; on connection failure, wait 5 seconds and retry indefinitely
        success = False
        while not success:
            try:
                r = requests.get(self.url, timeout=self.timeout, headers=self.headersParameters)
            except requests.exceptions.ConnectionError:
                time.sleep(5)
            else:
                success = True
        if r.status_code == 200:
            self.html = r.text
            self.current_page += 1
        else:
            self.html = ''
            print('[ERROR]', self.url, 'GET returned a non-200 HTTP status code')

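    # Each 360 result is rendered as an <h3 class="res-title"> whose <a> tag usually
    # carries the real target URL in a "data-url" attribute, with "href" as the fallback.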
    def get_urls(self):
        '''Parse the result URLs out of the current html and add them to urls.'''
        bsObj = BeautifulSoup(self.html, "html.parser")
        list_h3 = bsObj.find_all("h3", "res-title ")
        for h3 in list_h3:
            if "data-url" in h3.a.attrs:
                self.urls.add(h3.a.attrs["data-url"])
            else:
                self.urls.add(h3.a.attrs["href"])
        # Extract the next page's address
        next_pages = re.findall(r' href="(/s\?q=[\w%&=-]*?)"', self.html)
        if len(next_pages) > 0:
            self.next_page_url = 'https://www.so.com' + next_pages[-1]
        else:
            self.next_page_url = ''

    def print_urls(self):
        '''Print every URL collected so far.'''
        for url in self.urls:
            print(url)

    def run(self):
        while not self.is_finish():
            self.get_html()
            self.get_urls()
            self.switch_url()
            print(str(self.p1 * self.i1) + " %")
            if not self.is_finish():
                # Random delay between pages to avoid being throttled
                time.sleep(random.randint(6, 20))
                self.i1 += 1
        if self.p1 * self.i1 < 100:
            print("100 %")
        self.print_urls()
        print("Done......")

if __name__ == '__main__':
    help = '360_crawler.py -k <keyword> [-t <timeout> -p <total pages>]'
    keyword = None
    timeout = None
    totalpages = None
    
    try:
        opts, args = getopt.getopt(sys.argv[1:], "hk:t:p:",
                                   ["keyword=", "timeout=", "totalpages="])
    except getopt.GetoptError:
        print(help)
        sys.exit(2)
    # Parse command-line arguments
    for opt, arg in opts:
        if opt == '-h':
            print(help)
            sys.exit()
        elif opt in ("-k", "--keyword"):
            keyword = arg
        elif opt in ("-t", "--timeout"):
            timeout = arg
        elif opt in ("-p", "--totalpages"):
            totalpages = arg
    if keyword is None:
        print(help)
        sys.exit()

    c = crawler(keyword)
    if timeout is not None:
        print('Connection timeout: ' + timeout + ' seconds')
        c.set_timeout(timeout)
    if totalpages is not None:
        print('Fetching ' + totalpages + ' search result pages')
        c.set_total_pages(totalpages)
    print("0 %")
    c.run()
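# Example invocation (filename assumed from the usage string above):
#   python 360_crawler.py -k keyword -t 30 -p 3
# searches 360 for "keyword" with a 30-second request timeout and crawls 3 result pages.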
