python爬虫——中国大学排名详细教程（全注释）

使用步骤：安装 python；下载 requests 库和 BeautifulSoup 库；复制代码，按需求修改（或保持不变）；运行。代码开头：import requests / from bs4 import BeautifulSoup / import bs4 / def getHTMLText(url): try: r = requests.get(url, timeout=3...

怼你嶶笑纯屬礼貌

2313人浏览 · 2019-10-21 19:01:48

怼你嶶笑纯屬礼貌 · 2019-10-21 19:01:48 发布

使用步骤：

安装python
下载（安装）requests 库和 BeautifulSoup 库（例如：pip install requests beautifulsoup4）
复制代码按需求修改（或保持不变）
运行

import requests
from bs4 import BeautifulSoup
import bs4

def getHTMLText(url):
    """Fetch ``url`` and return the response body as text.

    Returns an empty string when the request fails (connection error,
    timeout, or an HTTP error status), so callers never see a requests
    exception raised from this function.
    """
    try:
        r = requests.get(url, timeout=30)   # give up after 30 seconds
        r.raise_for_status()                # turn 4xx/5xx responses into an exception
        # r.encoding is guessed from the HTTP headers, while
        # r.apparent_encoding is detected from the body itself; use the
        # latter so the text is decoded according to the actual content.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort fetch: only request-related failures are mapped to "".
        # Unrelated errors (and Ctrl-C) are no longer silently swallowed.
        return ""


def fillUnivList(ulist, html):
    """Parse the ranking table in ``html`` and append its rows to ``ulist``.

    For every <tr> tag that is a direct child of the first <tbody> in the
    document, a three-item list holding the ``.string`` of its first three
    <td> cells is appended to ``ulist`` (presumably rank, school name and
    total score, matching the header printed by printUnivList).

    ``ulist`` is modified in place and nothing is returned. If the document
    has no <tbody> (for example when ``html`` is empty because the download
    failed), ``ulist`` is left unchanged. Rows with fewer than three <td>
    cells are skipped.
    """
    soup = BeautifulSoup(html, "html.parser")
    # find() returns only the first <tbody>, or None when there is none.
    tbody = soup.find('tbody')
    if tbody is None:
        return
    for tr in tbody.children:
        # .children also yields non-tag nodes (e.g. whitespace strings
        # between rows); keep only real tags.
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr('td')  # calling a tag is shorthand for tr.find_all('td')
        if len(tds) < 3:
            # Not a full data row; indexing tds[2] would raise IndexError.
            continue
        ulist.append([tds[0].string, tds[1].string, tds[2].string])



def printUnivList(ulist, num, path=r'D:\python\titles.txt'):
    """Print the first ``num`` rows of ``ulist`` and write them to a text file.

    Args:
        ulist: sequence of rows; items 0, 1 and 2 of each row are printed
            under the rank, school name and total score columns.
        num: maximum number of rows to output. If ``ulist`` holds fewer
            rows, only the available rows are output.
        path: output file location; defaults to the location that was
            originally hard-coded. The file is overwritten.

    The header line is printed to stdout only; the data rows are both
    printed and written to the file, one row per line.
    """
    # Column 1 is padded with chr(12288), the full-width (CJK) space, so
    # that Chinese school names stay aligned.
    tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
    pad = chr(12288)
    print(tplt.format("排名", "学校名称", "总分", pad))
    # 'with' closes the file even if formatting or writing raises; an explicit
    # utf-8 encoding keeps the Chinese text writable on any platform.
    with open(path, 'w', encoding='utf-8') as out:
        # Slicing instead of range(num) avoids IndexError on short lists.
        for u in ulist[:num]:
            # A cell may be None (Tag.string is None for cells with nested
            # markup); render it as an empty field instead of crashing.
            rank, name, score = ("" if v is None else v
                                 for v in (u[0], u[1], u[2]))
            line = tplt.format(rank, name, score, pad)
            print(line)
            out.write(line + '\n')


def main():
    """Download the 2016 ranking page, parse it, and output the first 20 rows."""
    ranking_url = "http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html"
    rows = []
    page = getHTMLText(ranking_url)
    fillUnivList(rows, page)
    printUnivList(rows, 20)  # the second argument limits how many rows are output

# Script entry point: the crawl runs as soon as this file is executed
# (there is no __main__ guard, so importing the file runs it as well).
main()

CSDN学习社区

CSDN联合极客时间，共同打造面向开发者的精品内容学习社区，助力成长！

更多推荐