pythonXXX云所有厂商
摘要:分两步抓取 wooyun.org 的厂商列表。第一步根据输入的页数生成各分页的 URL 并写入 xh.txt;第二步逐个请求这些 URL,提取页面中的链接文字并写入 result.txt。
·
第一步:
#!/usr/bin/env python
#coding=utf-8
#url列表
import urllib,urllib2,re
# Ask how many listing pages to generate URLs for. int() below raises
# ValueError if the answer is not a whole number.
number = raw_input('enter a number:\n')
link = 'http://www.wooyun.org/corps/page/'

# Write one page URL per line; pages are numbered from 1, so page s+1 is
# emitted for loop index s. Each URL is echoed as it is written.
# NOTE: the original guarded the echo with `number==0`, but `number` is the
# raw_input string, so that comparison with an int was never true and the
# guard was dead code. It has been dropped.
with open('xh.txt', 'w') as newf:
    for s in range(int(number)):
        f = link + str(s + 1)
        newf.write(f + '\n')
        print(f)

# Read the file back and show its lines to confirm what was written.
with open('xh.txt', 'r') as door:
    print(door.readlines())
第二步:
#!/usr/bin/env python
#coding=utf-8
#完整爬虫
import string,urllib2,urllib,re
# HTTP headers attached to every page request made by jx(); the User-Agent
# value is a desktop Firefox string.
send_headers = {
    'Host':'www.wooyun.org',
    'User-Agent':'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0',
    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Connection':'keep-alive'
}
# Output file, opened once at module level; jx() writes the extracted entries
# to it and closes it when it finishes.
newf=open('result.txt','w')
# Matches everything from a `_blank">` marker up to the next `</a>`;
# re.S lets the match span line breaks. Compiled once, reused for every page.
_ITEM_RE = re.compile('_blank">.*?</a>', re.S)

# Fragments removed from each match, applied in exactly this order.
# They are unicode literals on purpose: the page text is decoded to unicode,
# and in Python 2 passing a non-ASCII byte string to unicode.replace() forces
# an implicit ASCII decode that raises UnicodeDecodeError.
_STRIP_FRAGMENTS = (
    u'_blank">',
    u'</a>',
    u'<img src="/images/sae_bottom_logo.png" title="Powered by Sina App Engine"></a-->',
    u'</span>',
    u'<span class="other fright">',
    u'<a href="/impression">行业观点</a>',
    u'乌云招聘',
    u'知识库',
    u'<a href="/impression">行业观点',
    u'http://',
    u'/',
)


def _clean_item(text):
    """Remove the known markup/navigation fragments from one matched snippet."""
    for fragment in _STRIP_FRAGMENTS:
        text = text.replace(fragment, u'')
    # Comma-separated entries are split onto separate lines.
    return text.replace(u',', u'\n')


def jx():
    """Fetch every URL listed in xh.txt and record the link texts found.

    Each non-blank line of xh.txt is requested with ``send_headers``, the
    response is decoded as UTF-8, and every ``_blank">...</a>`` snippet is
    cleaned with ``_clean_item``, printed, and written (UTF-8 encoded, one
    entry per line) to the module-level ``newf`` file.

    ``newf`` is closed when the function finishes, including when a request
    or decode error propagates. Network and decoding errors are not caught.
    """
    try:
        with open('xh.txt', 'r') as door:
            for line in door:
                # Lines keep their trailing newline; strip it so it does not
                # end up inside the request URL, and skip empty lines.
                url = line.strip()
                if not url:
                    continue
                req = urllib2.Request(url, headers=send_headers)
                r = urllib2.urlopen(req)
                try:
                    html = r.read()
                finally:
                    # Python 2 responses are not context managers.
                    r.close()
                unicodepage = html.decode('utf-8')
                for match in _ITEM_RE.findall(unicodepage):
                    item = _clean_item(match)
                    print(item)
                    # Encode explicitly: writing non-ASCII unicode to a
                    # Python 2 file object raises UnicodeEncodeError.
                    newf.write(item.encode('utf-8') + '\n')
    finally:
        newf.close()
# Run the crawl immediately when this script is executed.
jx()
收工
更多推荐
所有评论(0)