在Python中使用Xpath进行数据爬取的案例
摘要:本文给出一个 Python 2 脚本示例:使用 urllib2 抓取代理列表页面,通过 lxml 的 XPath 解析出每个代理的 IP、端口和协议,然后逐个测试代理是否可用,最后把可用的代理写入 proxy.txt。完整代码如下。
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import urllib2
import urllib
from lxml import etree
class proxyObj:
    """Plain record describing one scraped proxy server.

    The class-level values below are only defaults; the scraping loop in
    this file assigns per-instance values for all three attributes.
    """

    # Host (IP address) of the proxy, stored as a string.
    proxyService = ""
    # Port of the proxy, stored as a string so it can be concatenated.
    proxyPort = ""
    # Protocol label of the proxy; defaults to "http".
    proxyHttp = "http"
# Accumulators shared by the rest of the script: every scraped proxy, plus
# the proxies that passed / failed the connectivity probe.
proxyLists = list()
enableProxy = list()
disableProxy = list()

# Base URL of the listing pages (the page number is appended to it) and the
# URL that is fetched through each proxy to probe it.
url = "http://www.xicidaili.com/nn/"
proxy_test_url = "http://www.baidu.com/"

# Browser-like User-Agent header attached to every request.
proxy_headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36",
}
# Scrape listing pages 1..3 and append one proxyObj per table row to
# proxyLists.
for page in range(1, 4):
    tempurl = url + str(page)
    print(tempurl)
    request = urllib2.Request(url=tempurl, headers=proxy_headers)
    response = urllib2.urlopen(request)
    try:
        if response.getcode() != 200:
            # Nothing to parse on this page; move on to the next one.
            continue
        htmlcontent = response.read()
    finally:
        # Release the connection whether or not the read succeeded.
        response.close()
    htmlobj = etree.HTML(htmlcontent)
    # Use XPath to pull every <tr> out of the proxy table.
    htmldata = htmlobj.xpath("//table[@id='ip_list']//tr")
    print("htmldata--size:" + str(len(htmldata)))
    # Read ip / port / protocol row by row with *relative* paths. An
    # absolute "//td[n]" would search the whole document and produce three
    # independent lists that can drift out of step with each other.
    for row in htmldata:
        ip = "".join(row.xpath("./td[2]//text()")).strip()
        port = "".join(row.xpath("./td[3]//text()")).strip()
        protocol = "".join(row.xpath("./td[6]//text()")).strip()
        if not (ip and port and protocol):
            # Rows without these cells (e.g. a header row) hold no proxy.
            continue
        proxy = proxyObj()
        proxy.proxyService = ip
        proxy.proxyPort = port
        proxy.proxyHttp = protocol
        proxyLists.append(proxy)
def proxyTest(proxys):
    """Probe each proxy and file it under enableProxy or disableProxy.

    Every proxy is used to request ``proxy_test_url`` with a 10 second
    timeout. A proxy whose request returns HTTP 200 is appended to the
    module-level ``enableProxy`` list; any other status, or any error while
    connecting, appends it to ``disableProxy``.

    :param proxys: list of objects exposing ``proxyHttp``, ``proxyService``
        and ``proxyPort`` string attributes. Any argument that is not a
        non-empty list is ignored.
    :return: None; results are recorded in the module-level lists.
    """
    if not isinstance(proxys, list) or not proxys:
        return
    for proxy in proxys:
        # ProxyHandler dispatches on the lower-case URL scheme, so a label
        # such as "HTTP" must be normalised or the proxy is silently
        # bypassed and the probe hits the test URL directly.
        # NOTE(review): a proxy registered under "https" is still not used
        # for an http:// test URL - confirm whether that is intended.
        scheme = proxy.proxyHttp.lower()
        proxyInfo = {scheme: proxy.proxyService + ":" + proxy.proxyPort}
        # Build the proxy handler.
        proxyHandler = urllib2.ProxyHandler(proxyInfo)
        # Build an opener that routes through the proxy handler.
        opener = urllib2.build_opener(proxyHandler)
        # Build the probe request.
        request = urllib2.Request(url=proxy_test_url, headers=proxy_headers)
        try:
            # Send the request with a timeout so dead proxies fail fast.
            response = opener.open(request, timeout=10)
            try:
                reachable = response.getcode() == 200
            finally:
                response.close()
        except Exception:
            # Any connection/HTTP failure marks the proxy as unusable.
            # Exception (not a bare except) keeps Ctrl-C working.
            reachable = False
        if reachable:
            enableProxy.append(proxy)
        else:
            disableProxy.append(proxy)
# Run the proxy probe over everything that was scraped, print a summary
# report, and append the usable proxies to proxy.txt.
print("************start************************")
print("抓取的代理总数有:"+str(len(proxyLists)))
print("准备进入测代理测试程式............")
proxyTest(proxyLists)
print("测代理测试程式完成............")
print("正在生成测代理报告............")
print("测代理报告:............")
print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
print("+ 抓取的代理总数有:+"+str(len(proxyLists))+"++++++++++++++++++++++++++++++++++++++")
print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
print("+ 抓取的代理可用数有:+"+str(len(enableProxy))+"++++++++++++++++++++++++++++++++++++++")
print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
print("+ 抓取的代理不可用数有:+"+str(len(disableProxy))+"++++++++++++++++++++++++++++++++++++++")
print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
# Closing banner: the original printed "start" here a second time.
print("************end************************")
with open("proxy.txt", "a+") as f:
    for obj in enableProxy:
        # One proxy per line; without the newline all records run together.
        f.write(obj.proxyService + " " + obj.proxyPort + " " + obj.proxyHttp + "\n")
更多推荐
所有评论(0)