selenium 爬虫
1.事例:启信宝
2.浏览器:火狐,谷歌,phantomjs均可以使用
3.该事例中对selenium的方法进行了封装,读者可以
pip install SpiderTool==19.1.1
该模块对selenium的方法进行了更细的封装,方便快速开发
4.代码样例:
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
"""
File: .py
Author: Lijiacai ()
Date: 2018-12-29
Description:
"""
import os
import re
import sys
import random
from SpiderTool import Request
from SpiderTool import Browser
from loggingtool import loggingtool
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from lxml import etree
# Module-level logger built with the loggingtool package
# (source available via `pip install loggingtool`).
logging = loggingtool.init_log("qxb", "console", level="NOTSET")
# Make sibling modules next to this script importable.
cur_dir = os.path.dirname(os.path.realpath(__file__))
sys.path.append(cur_dir + "/")
def proxy_deal(proxies):
    """Pick one proxy at random from a proxy pool.

    If there are other agents, change the function here.

    :param proxies: a list of "ip:port" strings, or None/empty/other
                    to use no proxy.
    :return: one "ip:port" string, e.g. "12.23.88.23:2345", or None
             when the caller should use its own IP.
    """
    # Only a non-empty list yields a proxy; anything else falls back to None.
    # (isinstance/`is None` replace the original `type(...) ==` / `== None`.)
    if isinstance(proxies, list) and proxies:
        one_proxy = random.choice(proxies)
    else:
        one_proxy = None
    if one_proxy is None:
        logging.info("self ip")
    return one_proxy
class MyRequest(Request.Request):
    """Request subclass that sources proxies from the shared pool helper."""

    def proxy(self):
        """Return one "ip:port" proxy via proxy_deal, or None for own IP.

        If there are other agents, change the function here.
        :return: return a ip:12.23.88.23:2345
        """
        return proxy_deal(self.proxies)
class MyBrowser(Browser.Browser):
    """Browser subclass that sources proxies from the shared pool helper."""

    def proxy(self):
        """Return one "ip:port" proxy via proxy_deal, or None for own IP.

        If there are other agents, change the function here.
        :return: return a ip:12.23.88.23:2345
        """
        return proxy_deal(self.proxies)
class Qxb(object):
    """Scraper for m.qixin.com (qixinbao) company search and detail pages."""

    def __init__(self, proxies=None):
        """
        :param proxies: optional list of "ip:port" proxy strings handed to
                        MyBrowser; None means use our own IP.
        """
        self.proxies = proxies

    def _new_browser(self):
        """Build a Firefox MyBrowser configured with this scraper's proxies."""
        return MyBrowser(proxies=self.proxies, headless=False, timeout=20,
                         executable_path=None,
                         browser_type="Firefox")

    def _click_validate(self, browser):
        """Best-effort click on any 'btn-primary' captcha/validation buttons.

        Absence of the button is the normal case, so failures are only logged.
        """
        try:
            browser.wait_for_element_loaded("btn-primary", elem_type=By.CLASS_NAME,
                                            wait_time=3)
            buttons = browser.find_elements(value=u"btn-primary", by=By.CLASS_NAME)
            for button in buttons:
                browser.click_elem(button)
        except Exception:
            logging.exception(u"No validate")

    def search_page(self, keyword):
        """
        搜索页 (search page)

        :param keyword: 随意给,e.g 百度 (any search term, e.g. a company name)
        :return: list of dicts with "url", "company_id", "company_name",
                 "company_status", "history_names" keys (empty on failure)
        """
        result = []
        try:
            browser = self._new_browser()
            browser.get(url="https://m.qixin.com/")
            input_k = browser.find_element(value=u"//input[@placeholder='请输入企业名,人名,品牌名等']",
                                           by=By.XPATH)
            browser.send_keys(input_k, keyword)
            browser.keys(input_k, keyboard=Keys.ENTER)
            self._click_validate(browser)
            browser.implicitly_wait(3)
            page = etree.HTML(browser.page_source())
            # Company detail links all start with /company/.
            for href in page.xpath("//a/@href"):
                if re.findall(r"^/company/", href):
                    result.append({
                        "url": "https://m.qixin.com%s" % href,
                        "company_id": href,
                        "company_name": "",
                        "company_status": "",
                        "history_names": []
                    })
        except Exception as e:
            logging.exception(str(e))
        return result

    def result_page(self, url, company_id=None, **kwargs):
        """获取对应搜索页的详细工商信息 (fetch the detail page for a search hit).

        :param url: company page URL as returned by search_page
        :param company_id: unused; kept for interface compatibility
        :return: raw HTML of the /info/ page (viewed via view-source:)
        """
        url = url + "/info/"
        browser = self._new_browser()
        browser.get(url="view-source:" + url)
        self._click_validate(browser)
        # browser.get(url="view-source:" + url)
        # print() form works identically under Python 2 and 3 for one argument.
        print(browser.browser.current_url)
        browser.implicitly_wait(3)
        return browser.page_source()
if __name__ == '__main__':
    # Demo run: fetch one company's detail page and dump the HTML.
    qxb = Qxb()
    # qxb.search_page("baidu")
    # print() form works identically under Python 2 and 3 for one argument.
    print(qxb.result_page(url="https://m.qixin.com/company/5e5641da-211e-40ed-9629-b421f4cf1416"))
更多推荐
已为社区贡献2条内容
所有评论(0)