Python 2.7 爬取微博热搜
闲来无聊,搞搞热搜,先拿微博练手将爬取下来的数据发到自己的邮箱相关的moudle通过pip install就行效果展示什么都是扯淡,直接贴代码# -*- coding: utf-8 -*-#!/usr/bin/pythonimport requestsfrom bs4 import BeautifulSoupimport smtplibfrom email.mim...
·
闲来无聊,搞搞热搜,先拿微博练手
将爬取下来的数据发到自己的邮箱
相关的module通过pip install就行
什么都是扯淡,直接贴代码
# -*- coding: utf-8 -*-
#!/usr/bin/python
import requests
from bs4 import BeautifulSoup
import smtplib
from email.mime.text import MIMEText
from email.header import Header
import traceback
import time
import sys
# Python 2 only: re-import sys so that setdefaultencoding is callable again,
# then make UTF-8 the default codec for implicit str/unicode conversions.
# Python 3 has no reload() builtin and is already UTF-8 by default, so the
# hack is skipped there instead of raising NameError.
if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding('utf-8')

# Base URL of the Weibo search site; used to build the request URL and to
# turn the relative links found in the page into absolute ones.
weibo_url = "http://s.weibo.com"
class HotSearchInfo(object):
    """One entry of the hot-search ranking.

    Attributes:
        isForceTop: pinned-to-top marker (the parser stores 1 or 0).
        index: rank of the entry.
        title: display text of the entry.
        url: link to the entry.
        num: popularity number of the entry.
        flag: label text attached to the entry.
    """

    def __init__(self, isForceTop, index, title, url, num, flag):
        self.isForceTop = isForceTop
        self.index = index
        self.title = title
        self.url = url
        self.num = num
        self.flag = flag

    def __str__(self):
        """Return a one-line, human-readable summary of all fields."""
        return u'置顶: %s, 排名: %s, 标题: %s, 链接: %s, 热度: %s, 标识: %s' % \
            (self.isForceTop, self.index, self.title, self.url, self.num, self.flag)
# Fetch the hot-search page
def get_html():
    """Download the real-time hot-search page.

    Returns:
        The response body as text when the server answers with HTTP 200;
        an empty string for any other status code or when the request fails.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    data = {
        'cate': 'realtimehot'
    }
    html = ""
    try:
        # Without a timeout requests waits indefinitely on a stalled server.
        r = requests.get('%s/top/summary?' % (weibo_url), params=data,
                         headers=headers, timeout=10)
        if r.status_code == 200:
            html = r.text
    except requests.RequestException as exc:
        # Only network/HTTP failures are swallowed; programming errors and
        # KeyboardInterrupt still propagate.
        print("error: %s" % exc)
    return html
def _safe_int(text, default=0):
    """Convert scraped text to an int; return *default* if it is not a plain integer."""
    try:
        return int(text.strip())
    except (AttributeError, TypeError, ValueError):
        return default


# Parse the hot-search page into a list of hot-search entries
def parse_html(html):
    """Extract the hot-search entries from the page HTML.

    Args:
        html: HTML text of the hot-search page.

    Returns:
        A list of HotSearchInfo objects, one per table row that has both a
        rank cell and a title link. The list is empty when *html* is empty or
        the expected container/table body is not present.
    """
    result = []
    if not html:
        return result

    soup = BeautifulSoup(html, 'lxml')
    # The div that holds the ranking
    table = soup.find("div", attrs={"id": "pl_top_realtimehot"})
    # The tbody that holds the ranking rows
    tbody = table.find("tbody") if table is not None else None
    if tbody is None:
        return result

    # Walk every row and collect its fields
    for tr in tbody.find_all('tr'):
        td01 = tr.find("td", attrs={"class": "td-01"})
        td02 = tr.find("td", attrs={"class": "td-02"})
        td03 = tr.find("td", attrs={"class": "td-03"})
        link = td02.find("a") if td02 is not None else None
        if td01 is None or link is None:
            # Not a ranking row (no rank cell or no title link); skip it
            # rather than fail on a missing tag.
            continue

        # Pinned-to-top marker
        isForceTop = 1 if td01.find("i", attrs={"class": "icon-top"}) is not None else 0
        # Rank; page text is untrusted, so it is parsed with int() instead
        # of being executed through eval().
        index = _safe_int(td01.text)
        # Title
        title = link.text
        # Link; .get() avoids a KeyError when the anchor has no href
        href = link.get('href')
        url = "%s%s" % (weibo_url, href) if href else ""
        # Popularity number
        num_tag = td02.find("span")
        num = _safe_int(num_tag.text) if num_tag is not None else 0
        # Label
        flag_tag = td03.find("i") if td03 is not None else None
        flag = flag_tag.text if flag_tag is not None else u"无"

        result.append(HotSearchInfo(isForceTop, index, title, url, num, flag))
    return result
# Output
def output(tr):
    """Parse the page, render the entries as an HTML table and e-mail it.

    Args:
        tr: HTML text of the hot-search page. Despite the name, this is the
            whole document handed to ``parse_html``, not a single table row.
    """
    result = parse_html(tr)
    content = build_content(result)
    send_mail(content)
def _escape_html(value):
    """Return *value* as text with the HTML special characters escaped."""
    text = u"%s" % (value,)
    return (text.replace(u"&", u"&amp;")
            .replace(u"<", u"&lt;")
            .replace(u">", u"&gt;")
            .replace(u'"', u"&quot;"))


def build_content(list):
    """Render hot-search entries as an HTML document containing one table.

    Args:
        list: iterable of objects exposing ``index``, ``url``, ``title``,
            ``num`` and ``flag`` attributes, or None.

    Returns:
        A unicode HTML string; an empty string when *list* is None.
    """
    if list is None:
        return u""
    # Header cells are <th> inside a single row, and thead/html are closed.
    table = (u"<html><table><thead><tr><th>序号</th><th>关键词</th><th>热度标识</th></tr></thead>"
             u"<tbody>%s</tbody></table></html>")
    # The anchor wraps the title so the keyword is actually clickable.
    row = u"<tr><td>%s</td><td><a href=\"%s\">%s</a><span>%s</span></td><td>%s</td></tr>"
    # Every value is escaped because it comes from a scraped page.
    trs = [
        row % (
            _escape_html(info.index),
            _escape_html(info.url),
            _escape_html(info.title),
            _escape_html(info.num),
            _escape_html(info.flag),
        )
        for info in list
    ]
    return table % u"".join(trs)
def send_mail(content):
    """Send *content* as an HTML e-mail through the 163.com SMTP server.

    Success or failure is reported on stdout; SMTP and connection errors are
    caught and printed with a traceback rather than raised.

    Args:
        content: HTML body of the message.
    """
    sender = 'xxx@163.com'
    pwd = "xxx"
    receivers = ['xxx@163.com']
    message = MIMEText(content, 'html', 'utf-8')
    message['Subject'] = Header('微博热搜榜单-%s' % time.strftime("%Y-%m-%d"), 'utf-8')
    message['From'] = sender
    message['To'] = ",".join(receivers)
    # Bound before the try so the finally clause can always test it.
    smtp_obj = None
    try:
        # Passing the host to the constructor connects immediately and lets
        # the TLS layer know the server name; a bare SMTP_SSL() followed by
        # connect() fails on newer Python 3 releases.
        smtp_obj = smtplib.SMTP_SSL("smtp.163.com")
        # The password here is the authorization code configured in the
        # mailbox settings, not the mailbox login password.
        smtp_obj.login(sender, pwd)
        smtp_obj.sendmail(sender, receivers, message.as_string())
        print("邮件发送成功")
    except (smtplib.SMTPException, IOError):
        # IOError also covers socket/SSL failures while connecting.
        print("Error: 无法发送邮件, %s" % traceback.format_exc())
    finally:
        if smtp_obj is not None:
            smtp_obj.close()
def main():
    """Fetch the hot-search page and mail it; do nothing if the fetch failed."""
    html = get_html()
    # get_html() signals failure with an empty string, never None, so test
    # for emptiness; otherwise a failed download would reach the parser.
    if not html:
        print("get none")
        return
    output(html)


if __name__ == "__main__":
    main()
千万不要使用阿里云的邮箱,发不出去
更多推荐
已为社区贡献2条内容
所有评论(0)