1.事例:启信宝
2.浏览器:火狐,谷歌,phantomjs均可以使用
3.该示例中对selenium的方法进行了封装,读者可以通过下面的命令安装封装后的模块:

pip install SpiderTool==19.1.1

该模块对selenium的方法进行了更细的封装,方便快速开发

4.代码样例:

#!/usr/bin/env python
# _*_ coding:utf-8 _*_

"""
File:   .py
Author: Lijiacai ()
Date: 2018-12-29
Description:
"""

import os
import re
import sys
import random
from SpiderTool import Request
from SpiderTool import Browser
from loggingtool import loggingtool
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from lxml import etree

# Module-wide logger created with loggingtool (run `pip install loggingtool`
# to inspect its source). Note that this name shadows the stdlib `logging`
# module for the rest of the file.
logging = loggingtool.init_log("qxb", "console", level="NOTSET")

# Make the directory containing this file importable.
cur_dir = os.path.dirname(os.path.realpath(__file__))
sys.path.append(cur_dir + "/")


def proxy_deal(proxies):
    """
    Pick one proxy at random from a pool of candidates.

    If proxies should come from another source (e.g. a proxy API), change
    this function.

    :param proxies: list or tuple of proxy addresses such as
                    "12.23.88.23:2345"; None, an empty pool, or any other
                    type means "no proxy"
    :return: one randomly chosen entry of the pool, or None when no proxy
             is available
    """
    one_proxy = None
    # isinstance (instead of an exact type() comparison) also accepts tuples
    # and list subclasses, which used to be ignored silently.
    if proxies and isinstance(proxies, (list, tuple)):
        one_proxy = random.choice(proxies)
    if one_proxy is None:
        # No proxy selected: record that the caller's own IP will be used.
        logging.info("self ip")
    return one_proxy


class MyRequest(Request.Request):
    """Request subclass whose ``proxy`` method draws from ``self.proxies``."""

    def proxy(self):
        """
        Choose the proxy to use.

        Change this method if proxies should come from another source.

        :return: the result of ``proxy_deal(self.proxies)`` - one address
                 such as "12.23.88.23:2345", or None
        """
        return proxy_deal(self.proxies)


class MyBrowser(Browser.Browser):
    """Browser subclass whose ``proxy`` method draws from ``self.proxies``."""

    def proxy(self):
        """
        Choose the proxy to use.

        Change this method if proxies should come from another source.

        :return: the result of ``proxy_deal(self.proxies)`` - one address
                 such as "12.23.88.23:2345", or None
        """
        return proxy_deal(self.proxies)


class Qxb(object):
    """Browser-driven scraper for the mobile Qixinbao site (m.qixin.com)."""

    def __init__(self, proxies=None):
        """
        :param proxies: proxy pool handed to every MyBrowser this object
                        creates; None means no pool
        """
        self.proxies = proxies

    def _click_validation_buttons(self, browser):
        """
        Click every element with class "btn-primary" on the current page.

        Best effort: any exception raised while waiting for, locating or
        clicking the elements is logged and swallowed so the caller can
        carry on.

        :param browser: the MyBrowser instance showing the page
        """
        try:
            browser.wait_for_element_loaded("btn-primary", elem_type=By.CLASS_NAME,
                                            wait_time=3)
            for button in browser.find_elements(value=u"btn-primary", by=By.CLASS_NAME):
                browser.click_elem(button)
        except Exception:
            logging.exception(u"No validate")

    def search_page(self, keyword):
        """
        Search the site for a keyword and collect the company links found.

        :param keyword: free-form search term, e.g. a company name
        :return: list of dicts with the keys "url", "company_id",
                 "company_name", "company_status" and "history_names"; only
                 "url" and "company_id" are filled in. Errors are logged and
                 whatever was collected so far (possibly nothing) is returned.
        """
        result = []
        try:
            url = "https://m.qixin.com/"
            # NOTE(review): the browser is never closed explicitly here -
            # confirm how MyBrowser releases the underlying driver.
            browser = MyBrowser(proxies=self.proxies, headless=False, timeout=20,
                                executable_path=None,
                                browser_type="Firefox")

            browser.get(url=url)
            # Type the keyword into the search box and submit with ENTER.
            input_k = browser.find_element(value=u"//input[@placeholder='请输入企业名,人名,品牌名等']",
                                           by=By.XPATH)
            browser.send_keys(input_k, keyword)
            browser.keys(input_k, keyboard=Keys.ENTER)
            self._click_validation_buttons(browser)
            browser.implicitly_wait(3)
            page = etree.HTML(browser.page_source())
            for href in page.xpath("//a/@href"):
                # Only site-relative links under /company/ are company pages.
                if href.startswith("/company/"):
                    result.append({
                        "url": "https://m.qixin.com%s" % href,
                        "company_id": href,
                        "company_name": "",
                        "company_status": "",
                        "history_names": []
                    })
        except Exception as e:
            logging.exception(str(e))
        return result

    def result_page(self, url, company_id=None, **kwargs):
        """
        Fetch the page source of a company's detail ("/info/") page.

        :param url: company page URL, e.g. the "url" entry of a dict
                    returned by search_page
        :param company_id: accepted but not used
        :param kwargs: accepted but not used (presumably so a dict from
                       search_page can be passed as ``**company`` - verify
                       against callers)
        :return: what ``browser.page_source()`` returns after loading
                 "view-source:<url>/info/"
        """
        url = url + "/info/"
        # NOTE(review): the browser is never closed explicitly here -
        # confirm how MyBrowser releases the underlying driver.
        browser = MyBrowser(proxies=self.proxies, headless=False, timeout=20,
                            executable_path=None,
                            browser_type="Firefox")

        browser.get(url="view-source:" + url)
        self._click_validation_buttons(browser)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(browser.browser.current_url)
        browser.implicitly_wait(3)
        html = browser.page_source()
        return html


if __name__ == '__main__':
    # Demo: fetch and print the detail-page source of one known company.
    qxb = Qxb()
    # To try the search step instead, call: qxb.search_page("baidu")
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(qxb.result_page(url="https://m.qixin.com/company/5e5641da-211e-40ed-9629-b421f4cf1416"))

Logo

CSDN联合极客时间,共同打造面向开发者的精品内容学习社区,助力成长!

更多推荐