Python Selenium 抓取数据,手动点击下一页
摘要:使用 Selenium(PhantomJS)加载分页的展商列表,用 BeautifulSoup 解析每一页及其详情页,并通过点击分页链接逐页抓取;完整代码见下文。
·
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from selenium import webdriver
from bs4 import BeautifulSoup as bs
from 通用列表爬虫.util.rqst import get_html
from 通用列表爬虫.util.common import printred, saveTxt, rewriteTxt, getRewriteTxt
class douyu():
    """Scrape a paginated exhibitor list with Selenium and BeautifulSoup.

    The list page is rendered by a Selenium-driven browser; every list entry
    links to a detail page that is fetched with ``get_html``. One
    tab-separated row per exhibitor is printed and handed to ``saveTxt``.
    Paging is done by clicking a link in the pager of the list page.

    Attributes:
        driver: Shared Selenium driver, created when the class body is
            evaluated (i.e. at import time), not per instance.
        num: Counter kept from the original code; not read by any method here.
        count: Number of the list page currently being processed (1-based
            once ``testDouyu`` is running).
        max_pages: How many list pages ``testDouyu`` scrapes before stopping.
    """

    # driver = webdriver.Chrome(executable_path=r'D:\Program Files\chromedriver.exe')
    # NOTE(review): the browser starts as soon as this class is defined;
    # kept as-is because callers may rely on the class-level attribute.
    driver = webdriver.PhantomJS(executable_path=r'D:\Program Files\phantomjs-2.1.1-windows\bin\phantomjs.exe')
    num = 0
    count = 0
    max_pages = 9

    # CSS classes of the <div> cells read from each list entry, in column order.
    _LIST_CELLS = ("name", "country", "pavilion", "stand")
    # Indices of the scorecard <p> tags exported with newlines stripped,
    # in column order; paragraph 9 is appended afterwards, unmodified.
    _DETAIL_PARAGRAPHS = (0, 2, 3, 4, 5, 7, 8)
    _LAST_PARAGRAPH = 9

    def _detail_fields(self, url):
        """Return the detail-page columns for *url*, or None if unusable.

        The columns are the scorecard's <h2> text followed by the selected
        <p> texts. None is returned when the page has no ``div.scorecard``,
        no <h2> inside it, or fewer paragraphs than the export needs.
        """
        page = bs(get_html(url), "lxml")
        card = page.find("div", {"class": "scorecard"})
        if card is None or card.h2 is None:
            return None
        # Parse the paragraph list once instead of once per column.
        paragraphs = card.find_all("p")
        if len(paragraphs) <= self._LAST_PARAGRAPH:
            return None
        fields = [card.h2.text]
        fields.extend(
            paragraphs[idx].text.replace('\n', '')
            for idx in self._DETAIL_PARAGRAPHS
        )
        fields.append(paragraphs[self._LAST_PARAGRAPH].text)
        return fields

    # Named with a "test" prefix in the style of unittest test methods,
    # although this class does not subclass TestCase.
    def testDouyu(self):
        """Scrape up to ``max_pages`` list pages, then print "DONE" and return.

        For every ``a.popUp`` entry on a list page the four list cells, the
        detail URL and the detail-page columns are joined with tabs (with a
        trailing tab), printed, and passed to ``saveTxt('1.txt', ...)``.
        Entries whose detail page cannot be parsed are reported via
        ``printred`` and skipped instead of aborting the crawl.
        """
        self.driver.get("http://catalogue.hyve.ru/ru-RU/exhibitorlist.aspx?project_id=456")
        while True:
            self.count = self.count + 1
            if self.count > self.max_pages:
                printred("DONE")
                return
            printred("=================开始 "+str(self.count)+"===============")
            soup = bs(self.driver.page_source, "lxml")
            for link in soup.find_all("a", {"class": "popUp"}):
                columns = [
                    link.find("div", {"class": css}).text.replace('\n', '')
                    for css in self._LIST_CELLS
                ]
                detail_url = 'http://catalogue.hyve.ru' + link['href']
                columns.append(detail_url)

                detail = self._detail_fields(detail_url)
                if detail is None:
                    printred("SKIP (unexpected detail page layout): " + detail_url)
                    continue
                columns.extend(detail)

                # Build the row once; every column is followed by a tab,
                # including the last one, matching the original output.
                row = '\t'.join(columns) + '\t'
                print(row)
                saveTxt('1.txt', row)
                print("==============================")
            # Only page forward when another page will actually be scraped;
            # clicking after the final page is wasted work and can fail if
            # the pager has no further link.
            if self.count < self.max_pages:
                self.driver.find_element_by_css_selector(r'''#form > section > div > div.pager > a:nth-child(12)''').click()

    def tearDown(self):
        """Quit the Selenium driver and close the browser it controls."""
        self.driver.quit()
if __name__ == "__main__":
    # Script entry point: run the scrape, then shut the browser down.
    d = douyu()
    try:
        d.testDouyu()
    finally:
        # Always quit the driver, even if scraping raised, so the browser
        # process started by Selenium is not left running.
        d.tearDown()
更多推荐
已为社区贡献1条内容
所有评论(0)