python爬取段子
Scraping jokes from qiushibaike.com (https://www.qiushibaike.com/text/page/1/) — two implementations follow: one using Selenium, one using requests + lxml.
糗事百科段子爬取
https://www.qiushibaike.com/text/page/1/
selenium和requests和lxml两种方法
#!usr/bin/python
# -*- coding:utf8 -*-
import time
from selenium import webdriver
class QiuBaiSpider(object):
    """Scrape joke posts from qiushibaike.com by driving a real browser.

    Follows the "next" pagination button until it disappears, printing
    each scraped item along the way.
    """

    def __init__(self):
        # Entry page; further pages are reached by clicking "next".
        self.url = 'https://www.qiushibaike.com/text/page/1/'
        # NOTE(review): relies on chromedriver.exe sitting next to the script.
        self.driver = webdriver.Chrome('./chromedriver.exe')

    def parse_data(self):
        """Extract author/content pairs from the currently loaded page.

        :return: list of dicts with 'username' and 'content' keys
        """
        # Each post is a direct child div of the listing container.
        div_list = self.driver.find_elements_by_xpath(
            "//div[@class='col1 old-style-col1']/div")
        content_list = []
        for div in div_list:
            item = dict()
            item['username'] = div.find_element_by_xpath('.//h2').text
            item['content'] = div.find_element_by_xpath(
                './/div[@class="content"]/span').text
            print(item)
            content_list.append(item)
        return content_list

    def save_data(self, content_list):
        """Persist the scraped items (currently just prints each one)."""
        for content in content_list:
            print(content)

    def run(self):
        """Crawl page 1, then keep clicking "next" until there is no next page."""
        try:
            self.driver.get(self.url)
            content_list = self.parse_data()
            self.save_data(content_list)
            while True:
                next_url = self.driver.find_elements_by_xpath(
                    '//span[@class="next"]')
                if len(next_url) > 0:
                    next_url[0].click()
                    # click() returns before the new page has rendered;
                    # wait briefly so parse_data sees the next page's DOM,
                    # not a stale or half-loaded one.
                    time.sleep(2)
                    content_list = self.parse_data()
                    self.save_data(content_list)
                else:
                    break
        finally:
            # Always release the browser process, even if scraping fails;
            # the original leaked a Chrome instance on every run.
            self.driver.quit()
if __name__ == '__main__':
    # Run the Selenium-based crawler as a script.
    QiuBaiSpider().run()
#!usr/bin/python
# -*- coding:utf8 -*-
"""
糗事百科段子爬取
https://www.qiushibaike.com/text/page/1/
"""
import requests
from lxml import etree
class QiuBaiSpider(object):
    """Scrape 13 pages of joke posts from qiushibaike.com with requests + lxml."""

    def __init__(self):
        # The page number is substituted into the placeholder by get_url_list().
        self.base_url = 'https://www.qiushibaike.com/text/page/{}/'
        self.headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}

    def get_url_list(self):
        """Build the URLs for pages 1 through 13.

        :return: list of 13 URL strings
        """
        return [self.base_url.format(i) for i in range(1, 14)]

    def send_request(self, url):
        """GET *url* and return the raw response body.

        :param url: page URL to fetch
        :return: response body as bytes
        """
        # Timeout so a stalled connection cannot hang the crawl forever
        # (the original call could block indefinitely).
        response = requests.get(url, headers=self.headers, timeout=10)
        return response.content

    def parse_data(self, response):
        """Parse one listing page of HTML into author/content items.

        :param response: raw HTML bytes of one listing page
        :return: list of dicts with 'author' and 'content' keys
        """
        # Build an element tree from the raw bytes.
        element = etree.HTML(response)
        # Each post is a direct child div of the listing container.
        div_list = element.xpath("//div[@class='col1 old-style-col1']/div")
        data_list = []
        for div in div_list:
            item = dict()
            # Some blocks (e.g. ads) carry no <h2>; indexing [0] blindly
            # raised IndexError in the original. Fall back to ''.
            author_nodes = div.xpath('.//h2/text()')
            item['author'] = author_nodes[0].strip() if author_nodes else ''
            # Join the text fragments into one clean string instead of
            # keeping the raw whitespace-laden node list.
            item['content'] = '\n'.join(
                part.strip()
                for part in div.xpath('.//div[@class="content"]/span/text()')
                if part.strip()
            )
            data_list.append(item)
            print(item)
        return data_list

    def run(self):
        """Crawl every page: build the 13 URLs, fetch, parse, and print items."""
        # 1. Build the 13 page URLs.
        url_list = self.get_url_list()
        # 2. Fetch each page and 3. extract its data.
        for url in url_list:
            response = self.send_request(url)
            self.parse_data(response)
        # 4. Saving beyond print() is not implemented, matching the original.
# 4. 保存数据
if __name__ == '__main__':
    # Run the requests + lxml crawler as a script.
    QiuBaiSpider().run()
更多推荐



所有评论(0)