Scraping in Practice: Distributed Weibo Data Crawling
Distributed Weibo Crawling
1. Login
Login entry points. PC: https://weibo.com/, https://weibo.cn; mobile: https://m.weibo.cn/
-
Write it in a downloader middleware
Attach cookies, headers and similar data to each request there; a minimal sketch of the headers side follows below.
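The article only works through the cookies side (see the CookieMiddleware later), so as an illustration of the headers side here is a minimal rotating user-agent middleware. The class name and the UA list are my own assumptions, not from the original:

```python
from random import choice


class RandomUserAgentMiddleware(object):
    """Minimal sketch: pick a random User-Agent for every request.
    The UA list below is illustrative only."""
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
    ]

    def process_request(self, request, spider):
        request.headers['User-Agent'] = choice(self.user_agents)
```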
-
Write your own login script
Keep the login logic in a standalone script, and use Scrapy for the actual crawling.
-
Simulated-login script (it should return the cookies of a successful login); create the file yourself:
```python
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
# author: 满怀心 2019/7/15 20:42
import json
import re

import requests


class Weibo_login(object):
    def __init__(self, user_name, password):
        self.user_name = user_name
        self.password = password
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Referer': 'https://passport.weibo.cn/signin/login?entry=mweibo&r=https%3A%2F%2Fweibo.cn%2F%3Fluicode%3D10000011%26lfid%3D102803&backTitle=%CE%A2%B2%A9&vt='
        }
        self.login_url = 'https://passport.weibo.cn/sso/login'
        self.session = requests.session()

    def login(self):
        # Form fields expected by the mobile SSO login endpoint
        data = {
            'username': self.user_name,
            'password': self.password,
            'savestate': 1,
            'r': 'https://weibo.cn/',
            'ec': 0,
            'pagerefer': 'https://weibo.cn/pub/',
            'entry': 'mweibo',
            'wentry': '',
            'loginfrom': '',
            'client_id': '',
            'code': '',
            'qq': '',
            'mainpageflag': 1,
            'hff': '',
            'hfp': '',
        }
        self.session.post(self.login_url, data=data, headers=self.headers)
        return self.alert_login()

    def alert_login(self):
        # Verify the login by fetching the homepage and looking for the username
        html = self.session.get('https://weibo.cn').text
        username = re.findall(r'<div class="ut">(.*?)<a', html, re.S)
        cookies = {}  # collect the cookies of the logged-in session
        if username:
            print('Login succeeded, current user: {}'.format(username[0]))
            for key, value in self.session.cookies.items():
                cookies[key] = value
            return json.dumps(cookies)
        print('Login failed for account {}'.format(self.user_name))


# Simulate login for a batch of accounts and store their cookies in Redis
weibo_user = [
    ('13586971744', '13738939057')
]


def get_cookies(username, password):
    weibo = Weibo_login(username, password)
    cookies = weibo.login()
    return cookies


def initCookies(conn):
    """
    Simulate login for every account and cache the cookies in Redis.
    :param conn: a redis.Redis connection
    """
    for user in weibo_user:
        if conn.get('sina:Cookies:{}'.format(user[0])) is None:
            cookies = get_cookies(user[0], user[1])
            if cookies:
                conn.set('sina:Cookies:{}'.format(user[0]), cookies)
        else:
            print('Cookies for account {} are already stored in Redis'.format(user[0]))


if __name__ == '__main__':
    weibo = Weibo_login('13586971744', '13738939057')
    weibo.login()
```
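As a quick sanity check (my own addition, assuming a local Redis on the default port and that initCookies above has already run), the stored cookies can be read back out of Redis and reused with requests:

```python
import json

import redis
import requests

conn = redis.Redis('127.0.0.1', 6379)
raw = conn.get('sina:Cookies:13586971744')  # key format used by initCookies above
if raw:
    cookies = json.loads(raw)
    html = requests.get('https://weibo.cn', cookies=cookies, headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
    }).text
    print(html[:200])  # should show the logged-in homepage, not the login form
```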
-
Downloader middleware
```python
import json
from random import choice

import redis

from .login import initCookies  # assumes the login script above is saved as login.py


class CookieMiddleware(object):
    def __init__(self):
        """Open the Redis connection and initialise the cookies pool."""
        self.conn = redis.Redis('127.0.0.1', 6379)
        initCookies(self.conn)

    def process_request(self, request, spider):
        """Attach a random set of stored cookies to every outgoing request."""
        redis_keys = self.conn.keys()  # all keys currently in Redis
        while len(redis_keys) > 0:
            cookies = choice(redis_keys)
            if "Cookies" in cookies.decode():
                cookie = json.loads(self.conn.get(cookies))
                request.cookies = cookie
                break
            else:
                redis_keys.remove(cookies)  # not a cookies key; drop it and try another
```
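For the middleware to take effect it still has to be registered in the project's settings.py. The module path below is an assumption; adjust it to wherever CookieMiddleware actually lives:

```python
# settings.py -- the module path 'weibo.middlewares' is assumed, not from the original
DOWNLOADER_MIDDLEWARES = {
    'weibo.middlewares.CookieMiddleware': 543,
}
COOKIES_ENABLED = True  # let Scrapy send the cookies attached in the middleware
```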
-
Execution flow
1. When Scrapy starts, the cookies middleware is invoked.
2. Inside the middleware, the cookies pool is initialised first.
3. A set of cookies is then fetched from Redis and attached to the request.
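The "distributed" part of the title is commonly done with scrapy-redis, where workers on several machines pull start URLs from a shared Redis queue; the original article does not show this spider, so the sketch below is my own, with an illustrative name and redis_key:

```python
from scrapy_redis.spiders import RedisSpider


class WeiboSpider(RedisSpider):
    """Minimal scrapy-redis spider sketch: every worker pops start URLs
    from the shared 'weibo:start_urls' list in Redis."""
    name = 'weibo'
    redis_key = 'weibo:start_urls'

    def parse(self, response):
        # Parsing logic is project-specific; this is only a placeholder.
        self.logger.info('Fetched %s', response.url)
```

To actually run it distributed, settings.py must also point SCHEDULER and DUPEFILTER_CLASS at the scrapy_redis implementations and set REDIS_URL to the shared Redis instance.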
-