python selenium 抓取数据，手动点击下一页

#!/usr/bin/env python  # -*- coding:utf-8 -*-  from selenium import webdriver; from bs4 import BeautifulSoup as bs; from 通用列表爬虫.util.rqst import get_html; from 通用列表爬虫.util.common import printred, saveTxt, rewriteTxt, getRewriteTxt …

i学长的猫

1374人浏览 · 2021-04-14 09:45:31

i学长的猫 · 2021-04-14 09:45:31 发布

#!/usr/bin/env python
# -*- coding:utf-8 -*- 
from selenium import webdriver
from bs4 import BeautifulSoup as bs 
from 通用列表爬虫.util.rqst import get_html
from 通用列表爬虫.util.common import printred, saveTxt, rewriteTxt, getRewriteTxt
class douyu():
    """Scrape a paginated exhibitor list and each exhibitor's detail page.

    For every ``<a class="popUp">`` entry on a list page the scraper reads
    the ``name``, ``country``, ``pavilion`` and ``stand`` cells, fetches the
    linked detail page with ``get_html``, extracts the heading and selected
    paragraphs of its ``scorecard`` div, and emits one tab-separated record
    (printed and passed to ``saveTxt``). It then clicks the pager link to
    move to the next list page.

    NOTE(review): ``printred``, ``saveTxt`` and ``get_html`` are project
    helpers whose exact behaviour is not visible here — presumably coloured
    console output, writing a line to a file, and an HTTP GET returning
    HTML text; confirm against their definitions.
    """

    # Alternative driver kept for reference:
    # driver = webdriver.Chrome(executable_path=r'D:\Program Files\chromedriver.exe')
    #
    # NOTE: the driver is created when the class body is executed and is
    # shared by every instance of this class.
    # NOTE(review): PhantomJS support and the ``find_element_by_*`` helpers
    # used below were removed in Selenium 4 — this code needs Selenium 3.
    driver =  webdriver.PhantomJS(executable_path=r'D:\Program Files\phantomjs-2.1.1-windows\bin\phantomjs.exe')
    num = 0    # not read or written by any method of this class
    count = 0  # number of the list page currently being processed (1-based)

    # First list page, and the prefix for the relative detail-page links.
    START_URL = "http://catalogue.hyve.ru/ru-RU/exhibitorlist.aspx?project_id=456"
    BASE_URL = 'http://catalogue.hyve.ru'
    # Pager link that is clicked to advance to the next list page.
    NEXT_PAGE_SELECTOR = r'''#form > section > div > div.pager > a:nth-child(12)'''
    # CSS classes of the <div> cells read from each list entry, in output order.
    LIST_FIELDS = ("name", "country", "pavilion", "stand")
    # Indexes of the scorecard <p> elements that go into the record, in
    # output order; index 9 is handled separately because it keeps newlines.
    DETAIL_PARAGRAPHS = (0, 2, 3, 4, 5, 7, 8)

    @staticmethod
    def _text(node, strip_newlines=True):
        """Return ``node.text`` ('' when ``node`` is None), optionally without newlines."""
        if node is None:
            return ''
        text = node.text
        return text.replace('\n', '') if strip_newlines else text

    def _detail_fields(self, url):
        """Fetch the detail page at ``url`` and return its 9 record fields.

        The fields are the scorecard's ``<h2>`` text followed by the texts
        of paragraphs 0, 2, 3, 4, 5, 7, 8 (newlines removed) and 9
        (unchanged). Anything missing on the page yields an empty string so
        that the columns of the output record stay aligned.
        """
        page = bs(get_html(url), "lxml")
        card = page.find("div", {"class" : "scorecard"})
        if card is None:
            return [''] * (len(self.DETAIL_PARAGRAPHS) + 2)

        # Query the paragraphs once instead of once per extracted field.
        paragraphs = card.find_all("p")

        def paragraph(index, strip_newlines=True):
            node = paragraphs[index] if index < len(paragraphs) else None
            return self._text(node, strip_newlines)

        fields = [self._text(card.h2, strip_newlines=False)]
        fields.extend(paragraph(index) for index in self.DETAIL_PARAGRAPHS)
        fields.append(paragraph(9, strip_newlines=False))
        return fields

    # The "test" prefix is a leftover naming convention from the original
    # author; the name is kept because callers use it.
    def testDouyu(self, start_url=START_URL, max_pages=9, out_file='1.txt'):
        """Scrape up to ``max_pages`` list pages starting at ``start_url``.

        Args:
            start_url: URL of the first list page to load in the browser.
            max_pages: Highest page number to process; ``self.count`` is
                compared against it, so a non-zero ``self.count`` left over
                from an earlier call reduces the pages processed.
            out_file: File name passed to ``saveTxt`` for every record.

        Returns:
            None. Records are printed and handed to ``saveTxt``.
        """
        self.driver.get(start_url)
        while True:
            self.count = self.count +1
            if self.count > max_pages:
                printred("DONE")
                return
            printred("=================开始 "+str(self.count)+"===============")
            soup = bs(self.driver.page_source, "lxml")
            for link in soup.find_all("a", {"class" : "popUp"}):
                href = link.get('href')
                if not href:
                    # Without a link there is no detail page to fetch.
                    continue
                detail_url = self.BASE_URL + href
                fields = [self._text(link.find("div", {"class" : css_class}))
                          for css_class in self.LIST_FIELDS]
                fields.append(detail_url)
                fields.extend(self._detail_fields(detail_url))
                # Every field, including the last, is followed by a tab.
                record = '\t'.join(fields) + '\t'
                print(record)
                saveTxt(out_file, record)
                print("==============================")
            # Only advance when another page will actually be processed;
            # clicking after the last page is wasted work and fails if the
            # pager has no such link there.
            if self.count < max_pages:
                self.driver.find_element_by_css_selector(self.NEXT_PAGE_SELECTOR).click()

    def tearDown(self):
        """Quit the browser driver."""
        self.driver.quit()


# Backward-compatible alias: this function used to live at module level
# (which made ``instance.tearDown()`` fail); keep the old name importable.
tearDown = douyu.tearDown
if __name__ == "__main__":
    # Script entry point: run the scrape, then shut the browser down.
    d  =  douyu()
    try:
        d.testDouyu()
    finally:
        # Always release the browser process, even if scraping raised.
        d.tearDown()

CSDN学习社区

CSDN联合极客时间，共同打造面向开发者的精品内容学习社区，助力成长！

更多推荐