概述

使用urllib和re模块爬取百思不得姐段子

源码

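#!/usr/bin/env python3
# -*- coding:utf-8 -*-

r"""Crawl text jokes from budejie.com using urllib and re.

Sample of the user-name anchor markup, written as a regular expression:
<a\shref=".+?"\sclass="u-user-name" target="_blank">\w+?</a>
"""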

import urllib.request
import re


def crawl():
    """Crawl listing pages 1 through 50 under the text-joke base URL, in order."""
    base_url = 'http://www.budejie.com/text/'
    page = 1
    while page <= 50:
        # Each listing page is addressed by appending its number to the base URL.
        crawl_page('{}{}'.format(base_url, page))
        page += 1


def crawl_page(url):
    """Download one listing page and append its user-name/text pairs to the output file.

    The page is requested with a desktop-browser User-Agent header. If the
    response status is 200, the body is decoded as UTF-8, user names and joke
    texts are extracted with regular expressions, paired in document order,
    and appended to the output file. Each record is the user name on one
    line, the text on the following line(s), then a blank line.

    Args:
        url: Address of the listing page to download.

    Returns:
        None. Nothing is written when the response status is not 200.

    Raises:
        Whatever ``urllib.request.urlopen`` raises (for example
        ``urllib.error.URLError``), ``UnicodeDecodeError`` if the body is not
        valid UTF-8, and ``OSError`` if the output file cannot be opened.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36'
    }
    req = urllib.request.Request(url, headers=headers)

    # The context manager closes the HTTP response even if reading or
    # decoding fails, so the connection is never leaked.
    with urllib.request.urlopen(req) as resp:
        if resp.status != 200:
            return
        html = resp.read().decode('utf-8')

    # User names: the text of each anchor carrying the "u-user-name" class.
    username_pattern = r'class="u-user-name" target="_blank">(.+?)</a>'
    usernames = re.compile(username_pattern, re.DOTALL).findall(html)

    # Joke text: the text of the anchor nested inside each
    # <div class="j-r-list-c-desc"> element.
    text_pattern = r'<div class="j-r-list-c-desc">\s+<a .*?>(.+?)</a>\s{0,}</div>'
    texts = re.compile(text_pattern, re.DOTALL).findall(html)

    # Explicit UTF-8 keeps the Chinese text writable regardless of the
    # platform's default locale encoding.
    with open('/home/brandon/PythonProjects/MySpider/data/百思不得姐段子.txt', mode='a', encoding='utf-8') as f:
        # zip stops at the shorter list, so any unmatched trailing items are dropped.
        for username, text in zip(usernames, texts):
            f.write('{}\n{}\n\n'.format(username, text))

# Start the crawl only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    crawl()

运行结果


Logo

CSDN联合极客时间，共同打造面向开发者的精品内容学习社区，助力成长！

更多推荐