用scrapy获取电影网站的链接
摘要: 使用 Scrapy 新建一个工程, 从主页开始, 根据电影分类获取相应的 URL, 进入分类页面后遍历具体电影, 获取每部电影独立的 URL, 最终提取下载地址。具体步骤与代码如下。
思路:
使用scrapy新建一个工程,从主页开始,根据电影分类获取相应的url,进入分类页面后遍历具体电影,获取电影独立的url,然后获取下载地址.
具体代码:
首先关闭robots 选项:
vim naika/settings.py
ROBOTSTXT_OBEY = False
#!/usr/bin/python
#-*- coding:utf-8 -*-
import scrapy
from ..items import NaikaItem
class MyFirst(scrapy.Spider):
    """Spider for www.ck180.com.

    Crawl flow: home page -> category pages (from the nav menu) ->
    individual movie pages -> download page, where the movie name and
    magnet link are scraped into a NaikaItem.
    """

    name = "ck180"
    start_urls = ['http://www.ck180.com/']

    def parse(self, response):
        """Extract category links from the home-page menu and follow each.

        Yields a Request per category, handled by :meth:`get_movie`.
        """
        for href in response.xpath(
                '//li[@class="menu-item menu-item-type-taxonomy menu-item-object-category"]'
                '/a[@class="link_nav"]/@href'):
            # extract() on a single selector yields a string; guard against
            # empty hrefs with truthiness (it never returns None).
            link = href.extract()
            if link:
                # Resolve relative URLs against the current page.
                category_link = response.urljoin(link)
                yield scrapy.Request(category_link, callback=self.get_movie)

    def get_movie(self, response):
        """Parse a category listing page and follow each movie's own page.

        Yields a Request per movie, handled by :meth:`parse_movie`.
        """
        movie_links = response.xpath('//h3[@class="p-meta-title"]/a/@href').extract()
        for movie_link in movie_links:
            yield scrapy.Request(movie_link, callback=self.parse_movie)

    def parse_movie(self, response):
        """Parse a movie page: locate the download-page link and request it.

        The download page requires browser-like headers with a Referer set
        to the movie page's canonical URL, so a full header set is built here.
        """
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            # NOTE(review): hard-coded session cookie from a browser capture;
            # it will expire — consider removing or refreshing it.
            "Cookie": "Hm_lvt_a3f11a63cb3107860ad959bdacb6c6c8=1499587389; Hm_lpvt_a3f11a63cb3107860ad959bdacb6c6c8=1499607072",
            # Header values as strings (the originals were ints 1/1, which
            # scrapy tolerates but is non-standard for HTTP headers).
            "DNT": "1",
            "Host": "www.ck180.com",
            "Referer": "",  # filled in below with the page's canonical link
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
        }
        down_link = response.xpath(
            '//div[@class="download_dz"]/a[@class="download-link"]/@href').extract()
        # Positional <link> pick; presumably the canonical URL — fragile if
        # the page's <head> changes. TODO confirm with a rel-based XPath.
        self_link = response.xpath('/html/head/link[4]/@href').extract()
        # BUG FIX: extract() returns a list and never None, so the original
        # `if down_link is not None` was always true and down_link[0] /
        # self_link[0] raised IndexError on pages missing either element.
        if down_link and self_link:
            headers['Referer'] = str(self_link[0])
            yield scrapy.Request(down_link[0], callback=self.get_magnet, headers=headers)

    def get_magnet(self, response):
        """Parse the download page: scrape movie name and magnet link.

        Yields one NaikaItem with (possibly empty) list-valued fields,
        matching the original behavior of storing raw extract() results.
        """
        movie_item = NaikaItem()
        movie_name = response.xpath(
            '//*[@id="main-container"]/div/div[1]/div[3]/h3/a/text()').extract()
        movie_magnet = response.xpath('//*[@id="download"]/div/a[2]/@href').extract()
        movie_item['movie_name'] = movie_name
        movie_item['magnet'] = movie_magnet
        yield movie_item
更多推荐
已为社区贡献1条内容
所有评论(0)