python 下载百度贴吧图片
摘要:主程序用 Python 多线程下载百度贴吧帖子中的图片;每次运行下载 5 页,通过 db.py 在 MySQL 中记录下载进度,文件下载与运行日志由 tools.py 提供。
·
主程序
#!/usr/bin/python
# -*-coding:utf-8-*-
import urllib.parse, urllib.request, http.cookiejar, re, time
import tools
import threading
from db import db
class tieba(threading.Thread):
    """Worker thread that downloads the images found on one page of a post.

    An instance is bound to a post ID and a page number; ``run`` fetches that
    page, extracts every ``BDE_Image`` image URL and hands each one to
    ``tools.download``.
    """

    # Example: http://tieba.baidu.com/p/4519246742?see_lz=1&pn=1
    # Placeholders are the post ID followed by the page number.
    url = 'http://tieba.baidu.com/p/%s?see_lz=1&pn=%s'

    def __init__(self, tieid=4690733195, page=1):
        """Remember which post (``tieid``) and which page this thread handles."""
        super().__init__()
        self.tieid = tieid
        self.page = page

    @staticmethod
    def _fetch(url):
        """Return the body of ``url`` decoded as UTF-8."""
        # The context manager closes the HTTP response even when reading or
        # decoding fails; previously the response was never closed.
        with urllib.request.urlopen(url) as res:
            return res.read().decode('utf-8')

    @staticmethod
    def _parsePageCount(text):
        """Extract the total page count from page HTML; 1 if no pager is found."""
        pattern = r'(\d+)</span>回复贴,共<span class="red">(\d+)</span>页'
        match = re.search(pattern, text)
        return int(match.group(2)) if match else 1

    @staticmethod
    def _findImageUrls(text):
        """Return the ``src`` of every ``<img class="BDE_Image">`` tag, in order."""
        pattern = r'<img class="BDE_Image"([\s\S]*?)src="(.*?)"'
        # Group 1 only skips over the attributes between ``class`` and ``src``.
        return [src for _attrs, src in re.findall(pattern, text)]

    @staticmethod
    def getEndPage(tieid):
        """Return how many pages the post ``tieid`` has.

        The count is read from the pager text on page 1; when that text is
        absent the post is treated as having a single page.
        """
        return tieba._parsePageCount(tieba._fetch(tieba.url % (tieid, 1)))

    def run(self):
        """Thread body: download every image on this instance's page."""
        text = self._fetch(tieba.url % (self.tieid, self.page))
        for imageUrl in self._findImageUrls(text):
            print('第%s页,下载地址:%s' % (self.page, imageUrl))
            try:
                tools.download(imageUrl)
            except OSError as e:
                # Network and file-system failures (URLError is an OSError)
                # should cost one image, not the rest of the page.
                print('第%s页,下载失败:%s (%s)' % (self.page, imageUrl, e))
@tools.runTime('tieba.log')
def main():
    """Download the next batch of pages of one post and record the progress.

    The last recorded ``EndPage`` for the post is read from the ``tieba``
    table; the run continues from the following page, starts one ``tieba``
    thread per page (at most ``size`` pages) and waits for all of them.
    """
    # Post ID: the number at the end of http://tieba.baidu.com/p/<id>
    tieID = 4519246742
    # Number of pages downloaded per run.
    size = 5

    endpage = tieba.getEndPage(tieID)
    mysql = db('127.0.0.1', 'root', '', 'test')
    sql = 'select * from tieba where tieid=%s order by id desc limit 1' % tieID
    data = mysql.queryRow(sql)
    if isinstance(data, str):
        # db signals a failed connection by returning its error text instead
        # of a row; indexing that string would raise a TypeError.
        print('数据库连接失败:%s' % data)
        return

    start = int(data['EndPage']) + 1 if data else 1
    if start > endpage:
        # Everything up to the real last page is already recorded. Returning
        # (rather than exit()) lets the runTime wrapper log the finish line.
        print("已到尾页,结束下载!")
        return

    end = start + size - 1
    # Record only pages that exist: storing a page number beyond the current
    # last page would make later runs skip pages added to the post afterwards.
    lastPage = min(end, endpage)
    sql = """
    INSERT INTO `test`.`tieba` (
    `tieid`,
    `StartPage`,
    `EndPage`)
    VALUES('%s','%s', '%s');
    """ % (tieID, start, lastPage)
    mysql.execute(sql)

    threads = []
    for page in range(start, lastPage + 1):
        thread = tieba(tieID, page)
        thread.start()
        threads.append(thread)
    if end > endpage:
        print("已到尾页,结束下载!!")
    for t in threads:
        # Wait for every page thread to finish.
        t.join()
    print("退出主线程")


if __name__ == '__main__':
    main()
tools.py
#!/usr/bin/python
# -*-coding:utf-8-*-
import time, random
import urllib.request
import os.path, re
'''
自定义工具方法,tools.py
'''
def runTime(file='test.log'):
    """Return a decorator that logs the start, end and duration of each call.

    :param file: str path of the log file passed to ``log``
    :return: a decorator; the wrapped function returns whatever the original
        function returns
    """
    # Local import: the module's import block does not provide functools.
    import functools

    def _runTime(func):
        """Wrap ``func`` so that its running time is logged and printed."""
        @functools.wraps(func)  # keep the wrapped function's name and docstring
        def newFunc(*args, **kwargs):
            # time.clock() was removed in Python 3.8; perf_counter() is the
            # documented replacement for measuring elapsed time.
            start = time.perf_counter()
            log('开始任务', file)
            res = func(*args, **kwargs)
            end = time.perf_counter()
            msg = "结束任务,运行了: %f 秒" % (end - start)
            log(msg, file)
            print(msg)
            return res
        return newFunc
    return _runTime
def log(content, file='test.log', type=1):
    """Write one timestamped record to a log file.

    :param content: str message text
    :param file: str path of the log file
    :param type: 1 appends to the file; any other value truncates it first
    :return: None
    """
    mode = 'a+' if type == 1 else 'w+'
    stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    # The with-block closes the handle (it used to be left open), and records
    # end with '\n' instead of a bare '\r' so each one is its own line.
    with open(file, mode, encoding='utf-8') as f:
        f.write(stamp + ' : ' + content + '\n')
def download(url, filename='', foldername='', useOldName=False):
    """Download ``url`` into a local folder.

    :param url: str address of the file to download; a falsy value is a no-op
    :param filename: str name for the saved file; default is
        yyyymmddHHMMSS + 3 random digits + the original extension
    :param foldername: str target folder, created if missing; default is
        "year-month-day-hour" relative to the working directory (an absolute
        path is recommended)
    :param useOldName: bool keep the file name taken from the URL instead of
        ``filename`` / the generated name
    :return: path of the saved file, or None when ``url`` is empty
    """
    # Local import: only urllib.request is imported at module level.
    from urllib.parse import urlsplit

    if not url:
        return None

    # Use only the path component so a query string ("a.jpg?x=1") does not end
    # up in the file name or the extension.
    oldFileName = os.path.basename(urlsplit(url).path)
    # splitext yields the last extension (".jpg" for "a.b.jpg") and '' when
    # there is none; the previous regex crashed on names without a dot and
    # took everything after the first dot.
    suffix = os.path.splitext(oldFileName)[1]

    if foldername == '':
        t = time.localtime()
        foldername = '%d-%d-%d-%d' % (t.tm_year, t.tm_mon, t.tm_mday, t.tm_hour)
    # exist_ok avoids the check-then-create race when several threads download
    # into the same new folder at once.
    os.makedirs(foldername, exist_ok=True)

    if useOldName and oldFileName:
        filename = oldFileName
    elif filename == '':
        filename = (time.strftime("%Y%m%d%H%M%S", time.localtime())
                    + str(random.randint(100, 999)) + suffix)

    # os.path.join instead of a hard-coded '\\' so the path is valid on every OS.
    target = os.path.join(foldername, filename)
    urllib.request.urlretrieve(url, target)
    return target
db.py
# -*- coding: utf-8 -*-
import pymysql
class db:
    """Small helper around one PyMySQL connection that uses dictionary cursors.

    A failed connection does not raise: the error text is kept in ``error``
    and every query method returns that text instead of a result.
    """

    dbconnect = ''  # connection object; '' while there is no open connection
    error = ''  # text of the most recent recorded error

    def __init__(self, host, username, password, db='', port=3306):
        """Open the connection; on failure store the message in ``error``."""
        try:
            # Keyword arguments: PyMySQL 1.0+ no longer accepts these
            # positionally, and ``port`` was previously not passed on at all.
            self.dbconnect = pymysql.connect(
                host=host,
                user=username,
                password=password,
                database=db,
                port=port,
                charset='utf8',
                cursorclass=pymysql.cursors.DictCursor,
            )
        except pymysql.Error as e:
            self.error = str(e)

    def __del__(self):
        """Close the connection when the object is collected."""
        self.close()

    def execute(self, sql):
        """Run a data-changing statement and commit it.

        :return: the cursor (e.g. for ``rowcount``), or the stored error text
            when there is no connection. If the statement fails the
            transaction is rolled back and the message is kept in ``error``.
        """
        if self.dbconnect == '':
            return self.error
        conn = self.dbconnect
        cursor = conn.cursor()
        try:
            cursor.execute(sql)
            conn.commit()
        except pymysql.Error as e:
            # Roll back as before, but record the failure instead of hiding it.
            self.error = str(e)
            conn.rollback()
        return cursor

    def queryAll(self, sql):
        """Run a SELECT and return all rows (or the error text if unconnected)."""
        if self.dbconnect == '':
            return self.error
        with self.dbconnect.cursor() as cursor:
            cursor.execute(sql)
            return cursor.fetchall()

    def queryRow(self, sql):
        """Run a SELECT and return the first row (or the error text if unconnected)."""
        if self.dbconnect == '':
            return self.error
        with self.dbconnect.cursor() as cursor:
            cursor.execute(sql)
            return cursor.fetchone()

    def queryScalar(self, sql):
        """Run a SELECT and return the first column of the first row.

        Returns '' when the query yields no row, and the error text when
        there is no connection.
        """
        if self.dbconnect == '':
            return self.error
        row = self.queryRow(sql)
        if not row:
            # No matching row: previously this raised on ``None.values()``.
            return ''
        return next(iter(row.values()), '')

    def close(self):
        """Close the connection if one is open; safe to call repeatedly."""
        if self.dbconnect:
            self.dbconnect.close()
            # Reset to the "not connected" marker so that a later close() or
            # __del__ does not close the same connection twice.
            self.dbconnect = ''
if __name__ == '__main__':
    # Manual smoke test of the db class.
    # NOTE(review): host and credentials are hard-coded, and the statements
    # below delete and update rows - run only against a disposable database.
    # The instance is named ``conn`` so it no longer rebinds the class name
    # ``db``, and ``rows`` replaces a variable that shadowed the builtin ``all``.
    conn = db('172.23.16.91', 'unipei', 'jiaparts', 'jpd')
    one = conn.queryRow('select * from jpd.jpd_user limit 1')
    rows = conn.queryAll('select * from jpd.jpd_organ limit 10')
    count = conn.queryScalar('select count(*) from jpd.jpd_organ')
    delSql = 'delete from pap.pap_evaluation_system_history limit 100'
    delRes = conn.execute(delSql)
    updata = 'update jpd.jpd_user set lastvisittime=1 where id=61'
    updateRes = conn.execute(updata)
    print(one)
    print(rows)
    print(count)
    print(delRes.rowcount)
    print(updateRes.rowcount)
sql
-- Progress table for the downloader: the main program inserts one row
-- (tieid, StartPage, EndPage) per run and reads the newest row for a post
-- to decide which page to continue from.
-- CreateTime is not set by the main program's INSERT, so it stays NULL.
CREATE TABLE `tieba` (
`ID` int(11) NOT NULL AUTO_INCREMENT COMMENT '主键',
`TieID` bigint(11) DEFAULT NULL,
`StartPage` int(11) DEFAULT NULL COMMENT '开始页面',
`EndPage` int(11) DEFAULT NULL COMMENT '结束页面',
`CreateTime` int(13) DEFAULT NULL COMMENT '创建时间',
PRIMARY KEY (`ID`)
) ENGINE=InnoDB AUTO_INCREMENT=31 DEFAULT CHARSET=utf8;
更多推荐
已为社区贡献2条内容
所有评论(0)