I just finished learning the basics, so I wrote a small crawler to put them into practice.

#!/usr/bin/python
#coding:utf-8
'''
Crawl a novel site
'''

import urllib2
import re
from bs4 import BeautifulSoup

# Fetch the server-side response for a given url
def OpenPage(url):
    # custom request headers (e.g. a User-Agent) would go in this dict
    Myheaders = {}
    # urllib2.Request builds the request object
    req = urllib2.Request(url, headers=Myheaders)
    # firing the request is the equivalent of typing the address into
    # the browser's address bar; urlopen returns the response
    f = urllib2.urlopen(req)
    # the response is a file-like object; read() returns the raw body
    data = f.read()
    # decode error handlers: strict, ignore, replace; note that Python 2's
    # str.decode does not accept keyword arguments, so "ignore" is positional
    return data.decode("GBK", "ignore").encode("utf-8")

def Test1():
    url = "http://www.shengxu6.com/book/2967.html"
    print OpenPage(url)
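A side note before going on: urllib2 exists only in Python 2. Under Python 3 the same fetch would look roughly like the sketch below (my assumption is that the site still serves GBK-encoded pages; open_page_py3 is just an illustrative name, not part of the script above):

import urllib.request

def open_page_py3(url):
    # build the request; custom headers go in the headers dict
    req = urllib.request.Request(url, headers={})
    # fire the request and read the raw response bytes
    with urllib.request.urlopen(req) as f:
        data = f.read()
    # Python 3 strings are already unicode, so decoding GBK is enough;
    # no re-encode to utf-8 is needed
    return data.decode("gbk", errors="ignore")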

# Parse the index page and collect the jump url of each chapter
def ParseMainPage(page):
    # hand the raw HTML to the parsing library;
    # html.parser is the HTML engine that ships with Python
    soup = BeautifulSoup(page, "html.parser")
    # find_all searches the whole document and returns a list of matching tags;
    # here it keeps every tag whose href attribute contains the string "read"
    GetA = soup.find_all(href=re.compile("read"))
    # tag attributes are exposed like a dict, so item["href"] reads the link target
    return ["http://www.shengxu6.com" + item["href"] for item in GetA]

def Test2():
    url = "http://www.shengxu6.com/book/2967.html"
    page = OpenPage(url)
    print ParseMainPage(page)
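To make the href filter concrete, here is a self-contained toy run (the HTML is invented for illustration, not taken from the real site):

from bs4 import BeautifulSoup
import re

html = '''
<a href="/read/2967_1.html">Chapter 1</a>
<a href="/book/2967.html">Back to the index</a>
<a href="/read/2967_2.html">Chapter 2</a>
'''
soup = BeautifulSoup(html, "html.parser")
# only the links whose href contains "read" survive the filter
for a in soup.find_all(href=re.compile("read")):
    print a["href"]
# prints /read/2967_1.html and /read/2967_2.html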

# Parse one chapter page to get its title and body text
def ParseDetailPage(page):
    # parse the raw HTML first
    soup = BeautifulSoup(page, "html.parser")
    # get_text() returns a tag's text content with the markup stripped
    Title = soup.find_all(class_="panel-heading")[0].get_text()
    Content = soup.find_all(class_="content-body")[0].get_text()
    return Title, Content

def Test3():
    url = "http://www.shengxu6.com/read/2967_2008175.html"
    page = OpenPage(url)
    print ParseDetailPage(page)
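One thing to be aware of: find_all(...)[0] raises an IndexError when a page lacks the expected class, for example when the site returns an error page instead of a chapter. A slightly more defensive variant (just a sketch, reusing the same class names):

def ParseDetailPageSafe(page):
    soup = BeautifulSoup(page, "html.parser")
    # find() returns the first match or None instead of raising
    title_tag = soup.find(class_="panel-heading")
    body_tag = soup.find(class_="content-body")
    if title_tag is None or body_tag is None:
        return None, None
    return title_tag.get_text(), body_tag.get_text()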

def WriteDataToFile(data):
    # "a+" opens for appending; the with block closes the file automatically
    with open("output.txt", "a+") as f:
        f.write(data)

def Test4():
    WriteDataToFile("dnaidnasod")         
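Since the caller has to encode to utf-8 by hand before writing (see the main block below), an alternative is to let the file object do the encoding. io.open accepts an encoding argument in Python 2 and then takes unicode directly; a minimal sketch:

import io

def WriteDataToFileUnicode(data):
    # the file object encodes the unicode data to utf-8 on write
    with io.open("output.txt", "a", encoding="utf-8") as f:
        f.write(data)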



if __name__ == "__main__":
    url = raw_input("Enter the url of the novel to crawl: ")
    # fetch the index page
    page = OpenPage(url)
    print "Clone Begin"
    UrlList = ParseMainPage(page)
    for item in UrlList:
        # each item is the url of one chapter
        detail = OpenPage(item)
        # parse the chapter into its title and body text
        Title, Content = ParseDetailPage(detail)
        print "Clone " + Title
        data = "\n\n" + Title + "\n\n" + Content
        # append this chapter to the output file
        WriteDataToFile(data.encode("utf-8"))
    print "Clone Done"