Novel-site crawler

A small scraper written as practice right after finishing the Python basics: it downloads a novel's index page, follows every chapter link, and appends each chapter's title and text to a local file.
#!/usr/bin/python
#coding:utf-8
'''
Crawl a novel website and save its chapters to a text file.
'''
import urllib2
import re
from bs4 import BeautifulSoup
# Fetch the server response for the given URL
def OpenPage(url):
    """Fetch *url* and return the page body re-encoded as UTF-8.

    The target site serves GBK-encoded HTML; undecodable bytes are
    dropped so one stray byte cannot abort the whole crawl.

    Parameters:
        url: address of the page to download.
    Returns:
        The page body as a UTF-8 encoded byte string.
    """
    # Extra request headers (e.g. a User-Agent) can be added here.
    my_headers = {}
    # urllib2.Request builds the request object.
    req = urllib2.Request(url, headers=my_headers)
    # Send the request -- equivalent to typing the URL into a browser.
    f = urllib2.urlopen(req)
    try:
        # The response is a file-like object; read() returns the body.
        data = f.read()
    finally:
        f.close()  # always release the connection
    # BUG FIX: in Python 2, str.decode() takes no keyword arguments,
    # so the error-handling mode ("ignore") must be passed positionally.
    return data.decode("GBK", "ignore").encode("utf-8")
def Test1():
url = "http://www.shengxu6.com/book/2967.html"
print OpenPage(url)
# Parse the index page and collect each chapter's URL
def ParseMainPage(page):
    """Extract every chapter URL from the book's index page.

    Parameters:
        page: UTF-8 encoded HTML of the index page.
    Returns:
        A list of absolute chapter URLs, in document order.
    """
    # html.parser is Python's built-in HTML engine -- no extra dependency.
    soup = BeautifulSoup(page, "html.parser")
    # Chapter links are exactly the anchors whose href contains "read".
    anchors = soup.find_all(href=re.compile("read"))
    # hrefs are site-relative, so prefix the site root to make them absolute.
    return ["http://www.shengxu6.com" + a["href"] for a in anchors]
def Test2():
url = "http://www.shengxu6.com/book/2967.html"
page = OpenPage(url)
print ParseMainPage(page)
# Parse one chapter page into its title and body text
def ParseDetailPage(page):
    """Extract a chapter's title and body text.

    Parameters:
        page: UTF-8 encoded HTML of a single chapter page.
    Returns:
        A (title, content) tuple of plain-text strings.
    """
    soup = BeautifulSoup(page, "html.parser")
    # find() is the idiomatic form of find_all(...)[0]: it stops at the
    # first match instead of building the whole result list.
    title = soup.find(class_="panel-heading").get_text()
    content = soup.find(class_="content-body").get_text()
    return title, content
def Test3():
url = "http://www.shengxu6.com/read/2967_2008175.html"
page = OpenPage(url)
print ParseDetailPage(page)
def WriteDataToFile(data):
    """Append *data* (a UTF-8 encoded byte string) to ./output.txt.

    The context manager guarantees the file is flushed and closed
    even if the write raises, so repeated calls safely accumulate
    chapters in one file.
    """
    with open("output.txt", "a+") as f:
        f.write(data)
def Test4():
    """Smoke test: append a throwaway string to output.txt."""
    WriteDataToFile("dnaidnasod")
if __name__ == "__main__":
    # Ask the user for the index-page URL of the novel to clone.
    url = raw_input("请输入要爬取的小说地址:")
    # Download the index page.
    page = OpenPage(url)
    print "Clone Begin"
    # Collect the URL of every chapter listed on the index page.
    UrlList = ParseMainPage(page)
    for item in UrlList:
        # Each item is one chapter's URL.
        detail = OpenPage(item)
        # Split the chapter page into its title and body text.
        Title,Content = ParseDetailPage(detail)
        print "Clone " + Title
        data = "\n\n" + Title + "\n\n" + Content
        # Append the chapter to the output file.
        WriteDataToFile(data.encode("utf-8"))
    print "Clone Done"
更多推荐
已为社区贡献1条内容
所有评论(0)