糗事百科爬虫改进
无事,抓糗事!看到一个哥们的代码,无事拿来改改,抓糗事百科文字内容。
无事,抓糗事!
看到一个哥们的代码,无事拿来改改,抓糗事百科文字内容
#!/usr/bin/env python
'''
for qiushibaike.com
'''
import urllib2
# import urllib
import re
import thread
import time
class Spider_Model():
def __init__(self):
self.page = 1
self.pages = []
self.enable = False
def GetPage(self,page):
myurl = r'http://www.qiushibaike.com/textnew/page/'+page
user_agent = 'Mozilla/5.0 (X11; Linux x86_64)'
headers = {'User-Agent':user_agent}
req = urllib2.Request(myurl,headers=headers)
myres = urllib2.urlopen(req)
mypage = myres.read()
unicodepage = mypage.decode('utf-8')
myItems = re.findall('<div.*?class="content">(.*?)<!--.*?-->.*?</div>',unicodepage,re.S)
Items = []
# print myItems
for item in myItems:
# print item
item = item.replace('\n','')
Items.append(item.replace(r'<br/>','\n'))
# Items.append(item[0])
return Items
def LoadPage(self):
while self.enable:
if len(self.pages) < 2:
try:
mypage = self.GetPage(str(self.page))
self.page += 1
self.pages.append(mypage)
except:
print 'can not connected to the url.'
else:
time.sleep(1)
def ShowPage(self,nowPage,page):
print '\n\n############################ Page %d #################################\n\n' % page
for item in nowPage:
print item
myinput = raw_input()
if myinput == 'quit':
self.enable = False
break
def start(self):
page = self.page
self.enable = True
print(u'waiting..............')
thread.start_new_thread(self.LoadPage,())
while self.enable:
if self.pages:
nowpage = self.pages[0]
del self.pages[0]
self.ShowPage(nowpage,page)
page +=1
if __name__ == '__main__':
#---------the begin of program-----------------
print u'''
-------------------------------------------------
xxxx
x
xxx
xxx
-------------------------------------------------
'''
print 'Press any key,to continue......'
raw_input()
mymodel = Spider_Model()
mymodel.start()
一切从简。不解释不说明,随便拍!
详细内容请参考:http://blog.csdn.net/pleasecallmewhy/article/details/8932310
更多推荐
已为社区贡献2条内容
所有评论(0)