whaledata爬虫task-4之爬虫实战

4.1 Task7 实战大项目：模拟登录丁香园，并抓取论坛页面所有的人员基本信息与回复帖子内容。丁香园论坛：http://www.dxy.cn/bbs/thread/626626#626626 。思路：模拟登录丁香园论坛；抓取评论和用户的个人主页链接；抓取用户个人主页信息；数据存储。#!/usr/bin/env python # -*- coding: u...

whalefall

321人浏览 · 2019-04-13 21:09:39

whalefall · 2019-04-13 21:09:39 发布

4.1 Task7 实战大项目

实战大项目：模拟登录丁香园，并抓取论坛页面所有的人员基本信息与回复帖子内容。

丁香园论坛：http://www.dxy.cn/bbs/thread/626626#626626 。

思路

模拟登录丁香园论坛
抓取评论和用户的个人主页链接
抓取用户个人主页信息
数据存储

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@author: Jock
"""

import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import requests
from lxml import etree
from copy import deepcopy

def login_dxy(url):
    """
    Log in to the DXY forum through a Chrome browser and return the page HTML.

    Args:
        url: Address of the forum thread to open before logging in.

    Returns:
        The page source (str) captured after the login steps, or None when
        a login step fails or the captured page source is empty.
    """
    # Stays None if any step fails before the page source is captured, so
    # the check at the end never hits an unbound name.
    html = None
    # Create the browser object
    driver = webdriver.Chrome()
    try:
        # Load the page
        driver.get(url)
        # Maximize the window
        driver.maximize_window()
        # Set the implicit wait used by the element lookups below
        driver.implicitly_wait(4)
        driver.find_element_by_link_text('登录').click()
        driver.find_element_by_link_text('返回电脑登录').click()
        driver.find_element_by_name('username').send_keys('自己的帐号名')
        driver.find_element_by_name('password').send_keys('自己的密码')
        driver.find_element_by_class_name('button').click()
        # Fixed pause after submitting the form (presumably to let the
        # logged-in page finish loading -- TODO confirm).
        time.sleep(10)
        print("登录成功")
        # Capture the page content
        html = driver.page_source
        print(len(html))  # Check whether the fetch succeeded
        print(type(html))  # Check the type of the fetched content
    except TimeoutException:
        print("Time out")
        print("登录失败！")
    except NoSuchElementException:
        print("No Element")
        print("登录失败！")
    finally:
        # Always release the browser process, on success and on failure.
        driver.quit()
    if html:
        print("抓取成功！")
        return html
    else:
        print("抓取失败！")
        return None

def get_user_info(url):
    """
    Download a user's profile page and extract the basic profile fields.

    Args:
        url: Address of the user's profile page.

    Returns:
        A list of strings: the user name, the user level, then the texts
        matched for the follow/fan/dingdang counters and for the
        post/digest/score/vote counters. Returns an empty string when the
        request or the extraction fails.
    """
    # Adjacent literals are concatenated by the parser, so the header value
    # contains a single space here instead of the indentation that a
    # backslash continuation inside the string would embed.
    user_agent = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36')
    headers = {'User-Agent': user_agent}
    try:
        # Fetch the full page; the timeout keeps one stalled profile from
        # blocking the whole crawl.
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()  # Raise HTTPError for an error status code
        info = []
        t = etree.HTML(r.text)
        # User name
        info.append(t.xpath('//div[@class="banner-inner__user-id pa"]/a/text()')[0])
        # User level
        info.append(t.xpath('//div[@class="user-level-area"]/text()')[0])
        # Follow count, fan count, dingdang count
        info.extend(t.xpath('//div[@class="follows-fans clearfix"]/div/p/a/text()'))
        # Post count, digest count, score, votes received
        info.extend(t.xpath('//div[@class="main-nav-inner"]/ul/li/span[1]/text()'))
        print(info)
        return info
    except Exception as exc:
        # Best-effort: one bad profile must not abort the crawl, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        print("访问出错")
        print(repr(exc))
        return ""  # On failure, return an empty string


def extract_data(html):
    """
    Parse the thread page and pair each author's profile info with a post.

    Args:
        html: HTML text of the forum thread page; may be None or empty.

    Returns:
        A list of (user_info, content) tuples built by zipping the results
        of get_user_info for every author link with the post texts, in
        page order. An empty list when there is nothing to parse.
    """
    if not html:
        # Nothing was fetched (e.g. the login failed); nothing to parse.
        return []
    # Build the element tree
    tree = etree.HTML(html)
    if tree is None:
        return []
    # ls_content holds the text of every post
    ls_content = []
    # Every td element that contains a post body
    for td in tree.xpath('//td[@class="postbody"]'):
        try:
            # Join all text nodes of the td into one string
            ls_content.append('\n'.join(td.xpath('.//text()')).strip())
        except Exception:
            print('出错')
    # Profile page addresses of the authors; the last one is the logged-in
    # user's own link
    ls_urls = tree.xpath('//div[@class="auth"]/a/@href')
    # Holds the basic profile info of every author
    ls_user_info = []
    for n, url in enumerate(ls_urls, start=1):
        print("现在开始抓取第{}位用户的主页：{}".format(n, url))
        ls_user_info.append(get_user_info(str(url)))
    # zip stops at the shorter list, so unmatched trailing entries are dropped
    ls_total = list(zip(ls_user_info, ls_content))
    if ls_total:
        # Show the first record as a sample; skipped when nothing was scraped
        print(ls_total[0])
    print("恭喜你！成功抓取信息{}条！".format(len(ls_total)))
    return ls_total

def save_data(ls_total, fpath):
    """
    Append the scraped records to a CSV file.

    Each record becomes one CSV row: the user's profile fields followed by
    the post text. Fields are written through csv.writer so that commas,
    quotes and line breaks inside the post text are quoted instead of
    corrupting the row structure.

    Args:
        ls_total: Iterable of (user_info, content) pairs, where user_info
            is a list of strings and content is the post text. A record
            whose user_info is not a list (profile fetch failed) is
            reported and skipped.
        fpath: Path of the output file; opened in append mode as UTF-8.
    """
    import csv  # local import: only this function needs it

    # newline='' leaves line-ending handling to the csv writer
    with open(fpath, 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, lineterminator='\n')
        for n, record in enumerate(ls_total, start=1):
            print("现在开始写入第{}位用户的信息".format(n))
            try:
                user_info, content = record[0], record[1]
                if not isinstance(user_info, list):
                    raise TypeError("user info is not a list")
                # Build a new list so the caller's data is left untouched
                row = user_info + [content]
                print(row)  # Debug output
                writer.writerow(row)
            except (csv.Error, TypeError, IndexError, UnicodeError, OSError):
                print("警告！第{}条信息写入出错！".format(n))

def main():
    """
    Run the whole crawl: log in, extract the records, save them to CSV.

    Stops early when login_dxy returns no page content, so that a missing
    page is never handed to the parser.
    """
    url = 'http://www.dxy.cn/bbs/thread/626626#626626'
    fpath = r'C:\Users\admin\Desktop\丁香园用户信息.csv'
    html = login_dxy(url)
    if not html:
        # Login or page capture failed; there is nothing to parse or save.
        print("未获取到页面内容，程序结束！")
        return
    ls_total = extract_data(html)
    save_data(ls_total, fpath)
    print("成功结束程序！")
# Measure elapsed time
def count_spend_time(func):
    """
    Call ``func`` with no arguments and print how long the call took.

    The duration is measured with time.perf_counter and reported as
    hours, minutes and seconds. All three parts are floats because they
    come from float floor-division and modulo.
    """
    began = time.perf_counter()
    func()
    elapsed = time.perf_counter() - began
    # Whole minutes first, then split them into hours and leftover minutes
    whole_minutes = elapsed // 60
    hour = whole_minutes // 60
    minute = whole_minutes % 60
    second = elapsed % 60
    print('spend {}hours,{}minutes,{}seconds'.format(hour, minute, second))

# Script entry point: run main through count_spend_time so the elapsed time
# is printed after it returns. Skipped when the module is imported.
if __name__ == '__main__':
    count_spend_time(main)

输出：

```
登录成功
346566
<class 'str'>
抓取成功！
现在开始抓取第1位用户的主页：http://i.dxy.cn/profile/楼医生
['楼医生', '常驻站友', '2', '35', '168', '63', '0', '3', '1']
现在开始抓取第2位用户的主页：http://i.dxy.cn/profile/lion000
...（省略)
现在开始写入第25位用户的信息
['lingguang', '常驻站友', '3', '0', '115', '42', '0', '1', '6', '应当是一种肾上腺依赖性的，在受惊吓时肾上腺素大幅度升高，出现恶性心律失常而致晕厥。我不知道当时HOLTER检查时有没有发作？当时若有发作时的心电图最有说服力。我认为应当排除这方面的情况。\n必要的时候作惊吓试验，当然现场要有心电监护以及抢救措施。']
现在开始写入第26位用户的信息
['呆子', '铁杆站友', '21', '12', '106', '88', '0', '11', '4', '考虑为癫痫，建议动态脑电图，若阴性。则试用抗癫痫药']
现在开始写入第27位用户的信息
['lingguang', '常驻站友', '3', '0', '115', '42', '0', '1', '6', '应当做惊吓试验，脑电图与心电图同时监测。我想肯定会有结果的。无非是两种情况：脑源性与心源性。']
成功结束程序！
spend 0.0hours,2.0minutes,8.271899323000014seconds