Python怎么自建ip代理池
先找一个免费代理网站 https://www.kuaidaili.com/free/inha/ ，爬取其上所有的 ip。由于该网站本身有反爬虫机制，所以用刚获取到的 ip 逐个尝试是否可用，找到可用 ip 后，就用这个 ip 作为代理继续爬取下一页。
#!/usr/bin/env python
#-*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
import pymysql
import time
# Index (1-based) of the next row to test in the ip_proxy table; advanced
# by proxies_switch() as dead proxies are skipped.
# (The original `global ip_num` statement was a no-op at module scope.)
ip_num = 1
# First page of the kuaidaili free-proxy listing; the page number is appended.
base_url = "https://www.kuaidaili.com/free/inha/"
# Desktop Chrome User-Agent so the site serves the normal HTML table.
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"}
# Currently active HTTP proxy; replaced in place by proxies_switch() when it dies.
proxies = {'http': '110.243.20.23:9999'}
def proxies_switch(url):
    """Walk the ip_proxy table until one proxy successfully fetches *url*.

    Reads rows by ascending id starting at the module-global ``ip_num``,
    tests each ``ip:post`` pair against *url*, and on success stores the
    working pair in the module-global ``proxies`` dict used by reptile_ip().

    NOTE(review): relies on the module-global ``cursor`` opened in
    ``__main__``; raises IndexError if the table is exhausted with no
    working proxy (same as the original behaviour).
    """
    print("正在进行ip的切换...")
    global ip_num
    status = False
    while not status:  # keep trying rows until one proxy answers
        print("正在验证第%s个ip地址" % (ip_num))
        # Parameterized query instead of %-formatting the SQL string.
        cursor.execute("select ip,post from ip_proxy where id = %s", (ip_num,))
        items = cursor.fetchall()
        ip = items[0][0]
        post = items[0][1]
        ip_post = ip + ":" + post
        try:
            # A dead proxy typically RAISES (ConnectionError/Timeout) rather
            # than returning ok == False, so any exception counts as failure;
            # the timeout keeps a silent proxy from hanging the loop.
            response = requests.get(url, proxies={'http': ip_post}, timeout=5)
            status = response.ok
        except requests.RequestException:
            status = False
        if status:
            proxies['http'] = ip_post
            # Report the row that actually succeeded (original printed after
            # the increment, off by one, and even on failed attempts).
            print("第%d个ip地址测试成功" % (ip_num))
        ip_num = ip_num + 1
    time.sleep(1)  # brief pause so the fresh proxy is not banned immediately
def reptile_ip(url):
    """Scrape one listing page and return its proxies as a list of dicts.

    Each dict has the keys ip/post/type/place/response_time, taken from the
    columns of the page's proxy table. If the first request is rejected,
    switches to a working proxy via proxies_switch() and retries once.

    Uses the module-globals ``headers`` and ``proxies``.
    """
    results = []  # renamed: the original shadowed the builtin ``list``
    html = requests.get(url, headers=headers, proxies=proxies)
    print("连接情况为", html.ok)
    if not html.ok:
        proxies_switch(url)
        html = requests.get(url, headers=headers, proxies=proxies)
    soup = BeautifulSoup(html.content, "html.parser")
    rows = soup.find("table", class_="table table-bordered table-striped").find_all("tr")
    for row in rows[1:]:  # rows[0] is the header row
        tds = row.find_all("td")  # query the cells once, not five times
        results.append({
            "ip": tds[0].text,
            "post": tds[1].text,
            "type": tds[3].text,   # column 2 is skipped, as in the original
            "place": tds[4].text,
            "response_time": tds[5].text,
        })
    return results
if __name__ == "__main__":
    url = base_url
    # Placeholders: fill in real host / user / password / database name.
    conn = pymysql.connect("数据库ip", user="用户名", passwd="密码", db="数据名")
    cursor = conn.cursor()
    try:
        # Rebuild the ip_proxy table from scratch on every run.
        cursor.execute("drop table if exists ip_proxy")
        createtab = """create table ip_proxy(
                    id integer NOT NULL auto_increment PRIMARY KEY,
                    ip char(50) not null ,
                    post char(20) not null ,
                    type char(20) not null,
                    place char(50)not null,
                    response_time char(20) not null)"""
        cursor.execute(createtab)
        sql = "insert into ip_proxy (ip,post,type,place,response_time) values (%s,%s,%s,%s,%s)"
        for page in range(1, 70):  # the free listing has ~69 pages
            print(page)
            # ``record`` instead of the original ``list`` (shadowed a builtin).
            for record in reptile_ip(url + str(page)):
                try:
                    cursor.execute(sql, (record["ip"], record["post"], record["type"],
                                         record["place"], record["response_time"]))
                    conn.commit()
                    print(str(record["ip"]) + "has been keeped")
                except pymysql.Error:
                    # Narrowed from a bare ``except:`` — only DB errors should
                    # trigger a rollback; anything else surfaces.
                    conn.rollback()
    finally:
        # Always release DB resources, even if scraping crashes mid-run.
        cursor.close()
        conn.close()
更多推荐



所有评论(0)