python html解析
摘要:下面以获取IP地址的物理位置为例,简要记录 Python 2 中 HTMLParser 的用法,以备后用(完整代码见下文)。
·
下面以获取IP地址的物理位置为例,简要记录 HTMLParser 的用法,以备后用:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import urllib2, HTMLParser, re
class IPParser(HTMLParser.HTMLParser):
    """Fetch an IP-lookup page and print the text of its ``<center>`` element.

    Constructing an instance performs the HTTP request and feeds the
    response body to the parser. Every ``</center>`` end tag prints the
    text captured for that element, re-encoded as UTF-8.

    Attributes:
        flag: Name of the most recently opened tag; None before the first
            start tag and after any end tag.
        code: Charset name found in a ``<meta>`` tag, or None if none seen.
        info: Last text chunk seen directly after a ``<center>`` start tag,
            or None if there was none.
    """

    DEFAULT_URL = "http://iframe.ip138.com/ic.asp"
    # Used when the page declares no charset in a <meta> tag.
    DEFAULT_CHARSET = "GB2312"

    # Class-level defaults so the handlers never hit an unset attribute,
    # e.g. text before the first tag or a page without a charset.
    flag = None
    code = None
    info = None

    def __init__(self, url=DEFAULT_URL, timeout=5):
        """Download *url* and parse it immediately.

        Args:
            url: Page to fetch; defaults to the ip138 lookup frame.
            timeout: Socket timeout in seconds for the request.

        Any failure while fetching or parsing is reported by printing
        "So sorry!" instead of raising (best-effort behaviour).
        """
        HTMLParser.HTMLParser.__init__(self)
        try:
            response = urllib2.urlopen(url, timeout=timeout)
            try:
                source = response.read()
            finally:
                # Release the connection even if read() fails.
                response.close()
            self.feed(source)
        except Exception:
            # Deliberately broad: this is the top-level boundary of a small
            # script. Unlike a bare ``except:`` it lets KeyboardInterrupt
            # and SystemExit propagate.
            print("So sorry!")

    def handle_starttag(self, tag, attrs):
        """Remember the current tag and pick up the charset from ``<meta>``.

        Both ``<meta charset="...">`` and
        ``<meta content="text/html; charset=...">`` are recognised.
        """
        self.flag = tag
        if tag != "meta":
            return
        for name, value in attrs:
            if not value:
                continue
            if name == "charset":
                self.code = value
                return
            # ``+`` (not ``*``) so an empty codec name is never captured.
            found = re.search(r"charset=([A-Za-z0-9_-]+)", value, re.I)
            if found:
                self.code = found.group(1)
                return

    def handle_data(self, data):
        """Keep the text that directly follows a ``<center>`` start tag."""
        if self.flag == "center":
            self.info = data

    def handle_endtag(self, tag):
        """Print the captured text when ``</center>`` closes."""
        if tag == "center" and self.info is not None:
            text = self.info
            # Raw page bytes must be decoded with the page charset first;
            # text that is already unicode is only re-encoded.
            if isinstance(text, bytes):
                text = text.decode(self.code or self.DEFAULT_CHARSET)
            print(text.encode("UTF-8"))
        # Text after a closing tag no longer belongs to the opened element,
        # so it must not overwrite the captured ``<center>`` text.
        self.flag = None
if __name__ == "__main__":
    # Instantiating the parser performs the lookup: the constructor fetches
    # the page and feeds it to the parser, which prints the result.
    IPParser()
更详细的用法请参考官方文档: http://docs.python.org/2/library/htmlparser.html
上面的获取方式还可以用正则表达式更简洁地实现:
# Regex-based alternative: fetch the page, decode it from GB2312 and print
# the text of the first <center> element as UTF-8.
response = urllib2.urlopen("http://iframe.ip138.com/ic.asp", timeout=5)
try:
    html = response.read().decode("GB2312")
finally:
    # Always release the connection, even if reading or decoding fails.
    response.close()
# Non-greedy so several <center> pairs on one line are not merged.
matches = re.findall("<center>(.*?)</center>", html)
if matches:
    print(matches[0].encode("UTF-8"))
更多推荐
已为社区贡献2条内容
所有评论(0)