Python3 - 抓取静态页面(图片)
python 3.4 · 使用 urllib.request、re 和 BeautifulSoup 递归抓取静态页面中的 jpg 图片（完整代码见下文）。
·
python 3.4
#!/usr/bin/env python
# coding=utf-8
import os
import re
import time
import urllib
import urllib.error
import urllib.parse
import urllib.request
from threading import *

from bs4 import BeautifulSoup
# Semaphore with a single permit; downloadimg holds it while printing
# its "downloaded" progress line.
screenLock = Semaphore(1)

# Entry URL of the crawl.
main_url = 'http://www.symmz.com'

# Request headers sent with every page fetch (browser-like User-Agent).
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.63 Safari/537.36',
}

# Counter used to name the saved images: 1.jpg, 2.jpg, ...
num = 1

# Pages already seen by the crawler; seeded with the entry URL.
pages = {main_url}

# Image URLs already downloaded, so the same jpg is not fetched twice.
srces = set()
def downloadimg(url, depth):
    """Download the JPEG images found on a page, then crawl its links.

    The page at ``url`` is fetched and parsed. Every ``<img>`` whose ``src``
    matches the image pattern is saved as ``./symmzImg01/<num>.jpg`` unless
    that image URL was already downloaded. Every ``<a>`` whose ``href``
    matches the link pattern is then visited recursively, depth-first, with
    a one-second pause after each visited link.

    Module-level state used: ``headers`` (request headers), ``srces`` (image
    URLs already saved), ``pages`` (page URLs already visited), ``num``
    (running file counter) and ``screenLock`` (guards the progress print).

    Args:
        url: Absolute URL of the page to process.
        depth: Remaining recursion depth; nothing is done when it is 0.

    Returns:
        None.
    """
    global num

    if depth == 0:
        return

    print(depth)
    print(url)
    req = urllib.request.Request(url, headers=headers)
    try:
        res_html = urllib.request.urlopen(req).read().decode('utf-8')
    except (urllib.error.URLError, UnicodeDecodeError):
        # HTTPError is a subclass of URLError. Without the page body there
        # is nothing to parse, so stop processing this page here.
        print('异常~')
        return

    soup = BeautifulSoup(res_html, 'html.parser')
    imgurllist = soup.find_all('img', {'src': re.compile(r'http://.+.jpg')})
    urllist = soup.find_all('a', {'href': re.compile(r'/.+?/.+?.html')})

    local_pash = './symmzImg01/'
    # urlretrieve does not create missing directories.
    os.makedirs(local_pash, exist_ok=True)

    for item in imgurllist:
        src = item['src']
        print(" src:" + src)
        if src in srces:
            print('url 重复~' + src)
            continue

        path = local_pash + str(num) + '.jpg'
        try:
            # Stream the remote file straight to disk.
            urllib.request.urlretrieve(src, path)
        except OSError:
            # One broken image must not abort the whole crawl; URLError and
            # ContentTooShortError are both OSError subclasses.
            print('异常~')
            continue

        srces.add(src)
        with screenLock:
            # Report before incrementing so the message names the file
            # that was actually written.
            print(str(num) + '.jpg 已下载\n')
        num += 1

    # Depth-first crawl of the linked pages.
    for link in urllist:
        # Resolve the href against the current page; urljoin leaves
        # absolute URLs (http or https) untouched.
        newurl = urllib.parse.urljoin(url, link['href'])
        if newurl in pages:
            continue
        # Mark as visited *before* recursing so link cycles are not
        # followed again further down the recursion.
        pages.add(newurl)
        print('newurl:' + newurl)
        downloadimg(newurl, depth - 1)
        time.sleep(1)
def start():
    """Begin crawling at ``main_url`` with a recursion depth of 3."""
    downloadimg(url=main_url, depth=3)
# Start the crawl only when this file is executed as a script.
if __name__ == "__main__":
    start()
更多推荐
已为社区贡献8条内容
所有评论(0)