第一步:

#!/usr/bin/env python
#coding=utf-8
# URL list
import urllib,urllib2,re


# Ask how many listing pages to enumerate. int() below raises ValueError
# if the answer is not a whole number.
number = raw_input('enter a number:\n')
link = 'http://www.wooyun.org/corps/page/'

# Pages are numbered from 1, so index i maps to URL suffix i + 1.
# A count of zero (or a negative one) yields an empty list.
page_urls = [link + str(i + 1) for i in range(int(number))]

# Write one URL per line and echo each as it is written. `with` guarantees
# the file is closed even if a write fails part-way through.
with open('xh.txt', 'w') as url_file:
    for page_url in page_urls:
        url_file.write(page_url + '\n')
        print(page_url)

# Read the file back and show its lines as a quick sanity check.
with open('xh.txt', 'r') as url_file:
    print(url_file.readlines())


第二步:

#!/usr/bin/env python
#coding=utf-8
# Full crawler

import string,urllib2,urllib,re

# HTTP headers sent with every page request.
send_headers = {
    'Host': 'www.wooyun.org',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Connection': 'keep-alive',
}

# Matches the tail of an anchor opened with target="_blank" up to its
# closing tag; re.S lets '.' span newlines inside the anchor.
_LINK_RE = re.compile('_blank">.*?</a>', re.S)

# (old, new) pairs applied in order to every regex match. They are unicode
# literals because the page text is unicode: passing a non-ASCII byte
# string to unicode.replace() triggers an implicit ASCII decode and raises
# UnicodeDecodeError on Python 2.
_REPLACEMENTS = (
    (u'_blank">', u''),
    (u'</a>', u''),
    (u'<img src="/images/sae_bottom_logo.png" title="Powered by Sina App Engine"></a-->', u''),
    (u'</span>', u''),
    (u'<span class="other fright">', u''),
    (u'<a href="/impression">行业观点</a>', u''),
    (u'乌云招聘', u''),
    (u'知识库', u''),
    (u'<a href="/impression">行业观点', u''),
    (u'http://', u''),
    (u'/', u''),
    (u',', u'\n'),
)


def _clean_item(raw):
    """Return *raw* with every (old, new) pair of _REPLACEMENTS applied in order."""
    for old, new in _REPLACEMENTS:
        raw = raw.replace(old, new)
    return raw


def jx():
    """Fetch every URL listed in xh.txt and save the cleaned link texts.

    Each non-empty line of xh.txt is requested with ``send_headers``; the
    response body is decoded as UTF-8 and searched with ``_LINK_RE``. Every
    match is cleaned by ``_clean_item``, printed, and written on its own
    line to result.txt as UTF-8.

    Raises:
        IOError: if xh.txt cannot be read or result.txt cannot be written.
        urllib2.URLError: if a page cannot be fetched.
        UnicodeDecodeError: if a response body is not valid UTF-8.
    """
    with open('xh.txt', 'r') as url_file, open('result.txt', 'w') as out_file:
        for line in url_file:
            # Drop the trailing newline so it does not end up in the
            # request URL; skip blank lines entirely.
            url = line.strip()
            if not url:
                continue

            req = urllib2.Request(url, headers=send_headers)
            resp = urllib2.urlopen(req)
            try:
                page = resp.read().decode('utf-8')
            finally:
                # Release the connection even if reading or decoding fails.
                resp.close()

            for match in _LINK_RE.findall(page):
                text = _clean_item(match)
                print(text)
                # Encode explicitly: writing unicode to a plain file would
                # fall back to the ASCII codec and fail on non-ASCII text.
                out_file.write((text + u'\n').encode('utf-8'))


jx()


收工

Logo

CSDN联合极客时间,共同打造面向开发者的精品内容学习社区,助力成长!

更多推荐