Multi-threaded crawler: scrape a large batch of proxies and check their availability
Code example
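The script below starts one spider thread per proxy-list site, extracts every ip:port it finds, then verifies the deduplicated list in batches of 20 checker threads. A proxy is kept only if a request routed through it succeeds, and, when checking against IP138, only if the IP echoed back matches the proxy's own address; working proxies are written to proxies.txt.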
#!/usr/bin/env python
#-*- coding:utf-8 -*-
"""
@author: HJK
"""
import logging
import os
import sys
import getopt
import datetime
import re
import threading
import platform
import requests
from fake_useragent import UserAgent
ua = UserAgent()
# Sites that publish free proxy lists.
SITES = ['http://www.proxyserverlist24.top/', 'http://www.live-socks.net/']
HEADERS = {'User-Agent': ua.random}
TIMEOUT = 5  # request timeout in seconds
SPIDER_PROXIES = None  # proxy used by the spider itself (None = direct connection)
# IP138 echoes the visitor's IP; used to verify that a proxy really works.
IP138 = 'http://2000019.ip138.com/'
def echo(color, *args):
    ''' Print a message, with ANSI colors on platforms that support them. '''
    colors = {'error': '\033[91m', 'success': '\033[94m', 'info': '\033[93m'}
    if color not in colors or platform.system() == 'Windows':
        print(' '.join(args))
        return
    print(colors[color], ' '.join(args), '\033[0m')
def get_content(url, proxies=None) -> str:
    ''' Fetch the page content for a URL, optionally through a proxy. '''
    echo('info', url)
    try:
        r = requests.get(url, headers=HEADERS, proxies=proxies, timeout=TIMEOUT)
        if r.status_code == requests.codes.ok:
            return r.text
        echo('error', 'request failed', str(r.status_code), url)
    except Exception as e:
        echo('error', url, str(e))
    return ''
def get_proxies_thread(site, proxies):
    ''' Worker thread: crawl one listing site and collect its proxies. '''
    content = get_content(site, SPIDER_PROXIES)
    # Each <h3> heading links to a detail page that contains a proxy list.
    pages = re.findall(r'<h3[\s\S]*?<a.*?(http.*?\.html).*?</a>', content)
    for page in pages:
        content = get_content(page, SPIDER_PROXIES)
        # Collect everything that looks like ip:port.
        proxies += re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}', content)
def get_proxies_set() -> list:
    ''' Crawl every site in SITES and return the deduplicated proxy list. '''
    spider_pool, proxies = [], []
    for site in SITES:
        # One spider thread per listing site; all of them append to the shared list.
        t = threading.Thread(target=get_proxies_thread, args=(site, proxies))
        spider_pool.append(t)
        t.start()
    for t in spider_pool:
        t.join()
    return list(set(proxies))
def check_proxies_thread(check_url, proxies, callback):
    ''' Worker thread: test a batch of proxies against check_url. '''
    for proxy in proxies:
        proxy = proxy.strip()
        proxy = proxy if proxy.startswith('http://') else 'http://' + proxy
        content = get_content(check_url, proxies={'http': proxy})
        if content:
            if check_url == IP138:
                # IP138 echoes the visitor's IP; the proxy is considered valid
                # only if the echoed IP matches the proxy's own address.
                ip = re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', content)
                if ip and ip[0] in proxy:
                    callback(proxy)
            else:
                callback(proxy)
def check_and_save_proxies(check_url, proxies, output_file):
    ''' Check every proxy and save the working ones to output_file. '''
    checker_pool = []
    open(output_file, 'w').write('')  # truncate any previous result file
    def save_proxy(proxy):
        echo('success', proxy, 'checked ok.')
        open(output_file, 'a').write(proxy + '\n')
    # Check in batches of 20 proxies, one thread per batch.
    for i in range(0, len(proxies), 20):
        t = threading.Thread(target=check_proxies_thread, args=(check_url, proxies[i:i+20], save_proxy))
        checker_pool.append(t)
        t.start()
    for t in checker_pool:
        t.join()
def proxy_run(check_url=IP138):
input_file, output_file = '', 'proxies.txt'
# input_file, output_file, check_url = '', 'proxies.txt', IP138
# if len(sys.argv) > 1:
# try:
# opts, _ = getopt.getopt(sys.argv[1:], 'u:f:o:')
# except getopt.GetoptError as e:
# echo('error', str(e))
# sys.exit(2)
# for o, a in opts:
# if o in ('-f'): input_file = os.path.abspath(a)
# elif o in ('-u'): check_url = a
# elif o in ('-o'): output_file = os.path.abspath(a)
# else: assert False, 'unhandled option'
start = datetime.datetime.now()
proxies = open(input_file, 'r').readlines() if input_file else get_proxies_set()
check_and_save_proxies(check_url, proxies, output_file)
stop = datetime.datetime.now()
    note = '\nTotal proxies: %s\nValid proxies: %s\nOutput file: %s\nTime elapsed: %s\n' % \
        (len(proxies), len(open(output_file, 'r').readlines()),
         output_file, stop - start)
echo('success', note)
if __name__ == "__main__":
# proxy_run("https://www.taoguba.com.cn/quotes/sz000651")
proxy_run()
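For reference, here is a minimal sketch (not part of the original script) of how the resulting proxies.txt could be consumed. The target URL http://httpbin.org/ip is only an illustrative assumption, not something the script uses:

import random
import requests

# Read the checked proxies written by proxy_run(), one "http://ip:port" per line.
with open('proxies.txt', 'r') as f:
    checked = [line.strip() for line in f if line.strip()]

if checked:
    proxy = random.choice(checked)
    try:
        # Route a test request through the chosen proxy.
        r = requests.get('http://httpbin.org/ip', proxies={'http': proxy}, timeout=5)
        print(proxy, '->', r.text)
    except requests.RequestException as e:
        print(proxy, 'failed:', e)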
Postscript
Updated: 2020-01-10