用scrapy获取电影网站的链接
摘要: 使用 Scrapy 新建一个工程, 从主页开始, 根据电影分类获取相应的 URL, 进入分类页面后遍历具体电影, 获取每部电影独立的 URL, 最终提取下载地址。具体步骤与代码如下。
思路:
使用scrapy新建一个工程,从主页开始,根据电影分类获取相应的url,进入分类页面后遍历具体电影,获取电影独立的url,然后获取下载地址.
具体代码:
首先关闭robots 选项:
vim naika/settings.py
ROBOTSTXT_OBEY = False
#!/usr/bin/python
#-*- coding:utf-8 -*-
import scrapy
from ..items import NaikaItem
class MyFirst(scrapy.Spider):
    """Spider for www.ck180.com.

    Crawl flow: home page -> category pages (from the nav menu) ->
    individual movie pages -> download page, where the movie name and
    magnet link are scraped into a NaikaItem.
    """

    name = "ck180"
    start_urls = ['http://www.ck180.com/']

    def parse(self, response):
        """Extract category links from the home-page menu and follow each.

        Yields a Request per category, handled by :meth:`get_movie`.
        """
        for href in response.xpath(
                '//li[@class="menu-item menu-item-type-taxonomy menu-item-object-category"]'
                '/a[@class="link_nav"]/@href'):
            # extract() on a single selector yields a string; guard against
            # empty hrefs with truthiness (it never returns None).
            link = href.extract()
            if link:
                # Resolve relative URLs against the current page.
                category_link = response.urljoin(link)
                yield scrapy.Request(category_link, callback=self.get_movie)

    def get_movie(self, response):
        """Parse a category listing page and follow each movie's own page.

        Yields a Request per movie, handled by :meth:`parse_movie`.
        """
        movie_links = response.xpath('//h3[@class="p-meta-title"]/a/@href').extract()
        for movie_link in movie_links:
            yield scrapy.Request(movie_link, callback=self.parse_movie)

    def parse_movie(self, response):
        """Parse a movie page: locate the download-page link and request it.

        The download page requires browser-like headers with a Referer set
        to the movie page's canonical URL, so a full header set is built here.
        """
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            # NOTE(review): hard-coded session cookie from a browser capture;
            # it will expire — consider removing or refreshing it.
            "Cookie": "Hm_lvt_a3f11a63cb3107860ad959bdacb6c6c8=1499587389; Hm_lpvt_a3f11a63cb3107860ad959bdacb6c6c8=1499607072",
            # Header values as strings (the originals were ints 1/1, which
            # scrapy tolerates but is non-standard for HTTP headers).
            "DNT": "1",
            "Host": "www.ck180.com",
            "Referer": "",  # filled in below with the page's canonical link
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
        }
        down_link = response.xpath(
            '//div[@class="download_dz"]/a[@class="download-link"]/@href').extract()
        # Positional <link> pick; presumably the canonical URL — fragile if
        # the page's <head> changes. TODO confirm with a rel-based XPath.
        self_link = response.xpath('/html/head/link[4]/@href').extract()
        # BUG FIX: extract() returns a list and never None, so the original
        # `if down_link is not None` was always true and down_link[0] /
        # self_link[0] raised IndexError on pages missing either element.
        if down_link and self_link:
            headers['Referer'] = str(self_link[0])
            yield scrapy.Request(down_link[0], callback=self.get_magnet, headers=headers)

    def get_magnet(self, response):
        """Parse the download page: scrape movie name and magnet link.

        Yields one NaikaItem with (possibly empty) list-valued fields,
        matching the original behavior of storing raw extract() results.
        """
        movie_item = NaikaItem()
        movie_name = response.xpath(
            '//*[@id="main-container"]/div/div[1]/div[3]/h3/a/text()').extract()
        movie_magnet = response.xpath('//*[@id="download"]/div/a[2]/@href').extract()
        movie_item['movie_name'] = movie_name
        movie_item['magnet'] = movie_magnet
        yield movie_item
更多推荐
已为社区贡献1条内容
所有评论(0)