煎蛋网爬虫小练习
1 #!/usr/bin/python 2 #coding:utf-8 3 import urllib.request 4 import os 5 import time 6 import random 7 8 def url_open(url): 9 # header = {} 10 # header['User-Agent'] = 'Mozilla/5...
·
1 #!/usr/bin/python
2 #coding:utf-8
3 import urllib.request
4 import os
5 import time
6 import random
7
def url_open(url):
    """Fetch *url* through a randomly chosen proxy and return the raw body.

    Args:
        url: Address of the resource to download.

    Returns:
        The response body as ``bytes``.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
    """
    # Attach the browser User-Agent to the request object that is actually
    # sent below (previously the header was set on a request that was
    # never used).
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1;WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0')

    # One proxy is picked at random per call.
    # NOTE(review): these addresses are hard-coded; confirm they are still
    # reachable before relying on them.
    proxylist = ['111.192.44.204:9000', '222.82.222.242:9999', '124.202.247.110:8080']
    proxy = random.choice(proxylist)
    # The mapping key is the URL scheme; it must be spelled 'http' or the
    # handler never matches any request and the proxy is silently ignored.
    proxyhandler = urllib.request.ProxyHandler({'http': proxy})
    opener = urllib.request.build_opener(proxyhandler)
    # Register the opener in the request module so that plain
    # urllib.request.urlopen() calls use it as well.
    urllib.request.install_opener(opener)

    # Close the response deterministically once the body has been read.
    with opener.open(req) as response:
        return response.read()
22
def find_img(url):
    """Collect the ``.jpg`` image addresses referenced by the page at *url*.

    The page is scanned for ``img src=`` markers; for each marker a
    ``.jpg`` suffix is looked for within the next 100 characters and the
    text in between is taken as the image address.

    Args:
        url: Address of the page to scan.

    Returns:
        A list of absolute image URLs, in page order.
    """
    from urllib.parse import urljoin

    # Decode the UTF-8 bytes into text (the reverse, encode('utf-8'),
    # turns text back into UTF-8 bytes).
    html = url_open(url).decode('utf-8')
    img_addrs = []

    # Find the start position a of each link and its end position b, then
    # slice out the text between them.
    a = html.find('img src=')
    while a != -1:
        b = html.find('.jpg', a, a + 100)
        if b != -1:
            # a + 9 skips 'img src=' plus the opening quote; b + 4 keeps
            # the '.jpg' suffix. urljoin fills in the page's scheme for
            # protocol-relative links ('//host/x.jpg') and leaves links
            # that are already absolute untouched, instead of blindly
            # prepending 'http:'.
            img_addrs.append(urljoin(url, html[a + 9:b + 4]))
        else:
            # No '.jpg' close to this marker: step past it and move on.
            b = a + 9
            print('A')
        a = html.find('img src=', b)

    # Echo every collected address.
    for each in img_addrs:
        print(each)
    return img_addrs
39
def save_img(folder, img_addrs):
    """Download every address in *img_addrs* and save it as ``<n>.jpg``.

    Files are written to the current working directory and are numbered
    with the module-level counter ``j``, which must be initialised before
    the first call (``download_mm`` sets it to 0).

    Args:
        folder: Not used by this function; files go to the current
            working directory.
        img_addrs: Iterable of image URLs to download.
    """
    global j
    print(img_addrs)
    for each in img_addrs:
        # Download before opening the output file so that a failed
        # request does not leave an empty numbered .jpg behind.
        img = url_open(each)
        j += 1
        with open(str(j) + '.jpg', 'wb') as f:
            f.write(img)
49
def download_mm(folder="d://xx22", page_count=26):
    """Main entry point: crawl the listing pages and save their images.

    Pages are visited in descending order, from ``page_count - 1`` down
    to 0, with a one-second pause after each page.

    Args:
        folder: Directory that receives the images. It is created when
            missing and becomes the current working directory.
        page_count: Number of listing pages to visit.
    """
    global j

    # exist_ok lets the script be re-run against an existing folder
    # instead of failing with FileExistsError.
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)
    # Running file-name counter shared with save_img.
    j = 0

    url = "http://jiandan.net/ooxx/"
    for page_num in range(page_count - 1, -1, -1):
        page_url = url + 'page-' + str(page_num) + '#comments'
        print(page_num)

        img_addrs = find_img(page_url)
        save_img(folder, img_addrs)
        # Pause between pages.
        time.sleep(1)
66
# Start the crawl only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    download_mm()
更多推荐
已为社区贡献1条内容
所有评论(0)