Python Crawler in Practice: Scraping the Douban Movie Top250
#!/usr/bin/python
# Filename: 实战:爬取豆瓣 1.py
# Date: 2020/05/28
# Author: --king--
import requests
from bs4 import BeautifulSoup
# import time
# import random
# 1. Target site, Douban Movie Top250: https://movie.douban.com/top250
# Build the headers dict (User-Agent and Cookie) that requests will send, copied from the browser as key/value strings
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
    'Cookie': 'bid=fN2LagmkNt4; __utmc=30149280; __utmz=30149280.1590661612.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmc=223695111; __utmz=223695111.1590661612.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; ll="118237"; _vwo_uuid_v2=DAB0E009D91A5997EB68193FF4CD1D7D1|3484ac5931eb6b7a1ed4556ae8b097a0; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1590672349%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DFAQOqIH8Q2MLgMi_b6fmOeaXr--EdxKk8-bXMOREZQGbX8qIQwITej9KSO8Ydwc4%26wd%3D%26eqid%3Dbbf8d42c0002bd5a000000025ecf91e8%22%5D; ap_v=0,6.0; __utma=30149280.1656049617.1590661612.1590661612.1590672349.2; __utma=223695111.916830186.1590661612.1590661612.1590672349.2; ct=y; _pk_id.100001.4cf6=db8ef939ce9eac77.1590661612.2.1590672541.1590661755.'
}
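# Note: the Cookie above is tied to one browser session and will expire sooner
# or later; if requests start coming back empty, replace it with a fresh value
# copied from your own browser's DevTools (Network tab).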
# Build the function that collects each movie's detail_url from a list page
def get_detail_urls(url):
    resp = requests.get(url, headers=headers)
    # Print to check that the page content comes back correctly
    # print(resp.text)
    htmls = resp.text
    # Extract the detail URLs
    soup = BeautifulSoup(htmls, 'lxml')
    # In the F12 DevTools, use Ctrl+F to confirm the ol tag's class value is unique on the page
    # Because class is a Python keyword, BeautifulSoup's attribute argument takes a trailing underscore: class_
    # find_all returns a list, which may need indexing
    # find returns only the first matching tag, so no indexing is needed; use find to grab the ol
    # ol = soup.find('ol', class_='grid_view')
    # lis = ol.find_all('li')
    # Merge the two statements into one
    lis = soup.find('ol', class_='grid_view').find_all('li')
    # print(lis)
    # lis is also a list, so loop over it to pull out the values we want
    # Create an empty list and collect the results in detail_urls
    detail_urls = []
    for li in lis:
        # Each li contains several a tags, so find, which returns the first one, is all we need
        detail_url = li.find('a')['href']
        # print(detail_url)
        detail_urls.append(detail_url)
    return detail_urls
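# Quick sanity check (a sketch; assumes the headers above are still valid):
# print(get_detail_urls('https://movie.douban.com/top250'))
# should print a list of 25 detail-page URLs for page 1.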
def parse_detail_url(url, f):
    # Parse the content of a detail page
    resp = requests.get(url, headers=headers)
    htmls = resp.text
    soup = BeautifulSoup(htmls, 'lxml')
    # 1. Get the movie title and release year; .text would return a single string
    # titles = soup.find('div', id='content').find('h1').text
    # Use .stripped_strings to drop extra spaces and newlines; wrap it in list() to materialize the elements
    titles = list(soup.find('div', id='content').find('h1').stripped_strings)
    # print(titles)
    # Glue the two strings in the list together
    titles = ' - '.join(titles)
    # print(titles)
    # 2. Get the director(s)
    directors = list(soup.find('div', id='info').find('span').find('span', class_='attrs').stripped_strings)
    # print(directors)
    # 3. Get the screenwriter(s)
    screenwriters = list(soup.find('div', id='info').find_all('span')[3].find('span', class_='attrs').stripped_strings)
    screenwriters = ''.join(screenwriters)
    # print(screenwriters)
    # 4. Leading actors
    actors = list(soup.find('span', class_='actor').find('span', class_='attrs').stripped_strings)
    actors = ''.join(actors)
    # print(actors)
    # 5. Rating
    scores = list(soup.find('strong', class_='ll rating_num').string)
    scores = ''.join(scores)
    # print(scores)
    # Every value passed to format must be a str; lists first need ''.join() to become strings
    f.write('{},{},{},{},{}\n'.format(titles, ''.join(directors), screenwriters, actors, scores))
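    # Note: titles and cast fields can themselves contain commas, which breaks
    # this hand-assembled CSV line. A safer sketch (not in the original script)
    # would hand the row to a csv.writer instead:
    #   import csv
    #   writer = csv.writer(f)  # quotes any field that contains a comma
    #   writer.writerow([titles, ''.join(directors), screenwriters, actors, scores])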
# Build the main function
def main():
    base_url = 'https://movie.douban.com/top250?start={}'
    # Open the output file once here, never inside the loop
    # Pass f as an argument into the function that writes the detail records
    with open('Top250.csv', 'a', encoding='utf-8') as f:
        # To crawl the whole list, vary the start parameter of the URL: each page starts at (page number - 1) * 25, 10 pages in total
        # Note: this loop must stay inside the with open block, so the file is not opened over and over
        # range(start, stop, step); stop itself is excluded, so it usually needs +1
        # for i in range(0, 226, 25):
        #     url = base_url.format(i)
        for i in range(10):
            url = base_url.format(25 * i)
            # The for loop advances i by itself; no i += 1 needed
            # print(url)
            detail_urls = get_detail_urls(url)
            # print(detail_urls)
            for detail_url in detail_urls:
                # Add a delay
                # time.sleep(random.randint(1, 2))
                # Pass f on into parse_detail_url
                parse_detail_url(detail_url, f)
# Call the main function
# A test run started returning empty values and raising errors after about 200 entries, so add a delay to avoid grabbing nothing from pages that have not finished loading
if __name__ == '__main__':
    main()
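One practical fix for the empty-value errors noted above: uncomment the import time and import random lines at the top, then enable the sleep inside the inner loop. A minimal sketch, assuming everything else stays as-is:

    import time
    import random

    for detail_url in detail_urls:
        # Pause 1-2 seconds between detail pages so Douban is less likely
        # to rate-limit the crawler and return empty pages
        time.sleep(random.randint(1, 2))
        parse_detail_url(detail_url, f)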