前言

人生苦短,我用 Python。

实现

爬取目标地址:

https://www.kuaidaili.com/free/inha/1

全部代码

import requests
import re

# Scrape the free-proxy listing page, probe every "ip:port" pair it contains by
# fetching Baidu through it, and append each working pair to ip.txt.

# Listing page to scrape.
url = "https://www.kuaidaili.com/free/inha/1"
# Browser-like User-Agent header sent with the listing request.
headers = {
    "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36"
}
# Fetch the listing page. The timeout keeps the script from blocking forever
# when the site does not answer.
response = requests.get(url, headers=headers, timeout=10)
# Page source as text.
html = response.text
# Extract the IP and port table cells. Raw strings keep "\d" and "\." as regex
# escapes instead of (invalid) string escapes; the "." around IP/PORT stands in
# for the quote character of the attribute value.
ips = re.findall(r"<td data-title=.IP.>(\d+\.\d+\.\d+\.\d+)</td>", html, re.S)
ports = re.findall(r"<td data-title=.PORT.>(\d+)</td>", html, re.S)
# Pair the i-th IP with the i-th port and probe each pair.
for ip in zip(ips, ports):
    proxy_address = "http://" + ip[0] + ":" + ip[1]
    proxies = {
        "http": proxy_address,
        "https": proxy_address,
    }
    # Only the probe request is guarded, so a failure to write ip.txt is not
    # misreported as an unusable proxy.
    try:
        requests.get("https://www.baidu.com/", proxies=proxies, timeout=2)
    except requests.RequestException:
        # The request through the proxy failed (connection error, timeout, ...).
        print(ip,"不能使用")
    else:
        print(ip,"能使用")
        # Append "ip:port" to ip.txt in the current working directory.
        with open("ip.txt", mode="a+", encoding="utf-8") as f:
            f.write(":".join(ip))
            f.write("\n")
Logo

CSDN联合极客时间,共同打造面向开发者的精品内容学习社区,助力成长!

更多推荐