urllib库学习 - 百思不得姐段子爬虫
概述：使用 urllib 和 re 模块爬取百思不得姐段子（完整源码见下文）。
·
概述
使用urllib和re模块爬取百思不得姐段子
源码
# !/usr/bin/env python
# -*- coding:utf-8 -*-
"""
<a\shref=".+?"\sclass="u-user-name" target="_blank">\w+?</a>
"""
import urllib.request
import re
def crawl():
    """Crawl listing pages 1 through 50 of the budejie.com text section.

    Each page URL is the base URL with the page number appended; every URL
    is handed to ``crawl_page`` in ascending page order.
    """
    prefix = 'http://www.budejie.com/text/'
    page_urls = (prefix + str(number) for number in range(1, 51))
    for page_url in page_urls:
        crawl_page(page_url)
def crawl_page(
    url,
    output_path='/home/brandon/PythonProjects/MySpider/data/百思不得姐段子.txt',
):
    """Fetch one listing page and append its (username, text) pairs to a file.

    Args:
        url: Address of the listing page to download.
        output_path: File the extracted posts are appended to. Defaults to
            the location this script originally hard-coded.

    Nothing is written when the response status is not 200. Exceptions
    raised by ``urllib.request.urlopen`` or ``open`` are not caught here and
    propagate to the caller.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36'
    }
    req = urllib.request.Request(url, headers=headers)

    # The context manager closes the connection even if reading or decoding
    # fails; the original left the response open.
    with urllib.request.urlopen(req) as resp:
        if resp.status != 200:
            return
        html = resp.read().decode('utf-8')

    # Username: captured from the anchor carrying class "u-user-name".
    username_pattern = r'class="u-user-name" target="_blank">(.+?)</a>'
    usernames = re.findall(username_pattern, html, re.DOTALL)

    # Post text: captured from markup shaped like
    #   <div class="j-r-list-c-desc">\s+<a .*?>(.+?)</a>\s{0,}</div>
    text_pattern = r'<div class="j-r-list-c-desc">\s+<a .*?>(.+?)</a>\s{0,}</div>'
    texts = re.findall(text_pattern, html, re.DOTALL)

    # Explicit UTF-8: the posts are Chinese text, and the platform default
    # encoding is not guaranteed to be able to represent them.
    with open(output_path, mode='a', encoding='utf-8') as f:
        # Usernames and texts are paired by position; zip() stops at the
        # shorter of the two lists.
        for username, text in zip(usernames, texts):
            f.write('{}\n{}\n\n'.format(username, text))
# Start the crawl only when this file is executed directly as a script.
if __name__ == "__main__":
    crawl()
运行结果
更多推荐



所有评论(0)