今天临时需要爬取一些双语资料

(尚未清洗)

需要充分利用

下面的代码用来获取Chinadaily网站上每篇双语新闻的链接。首先要研究这些网页的网址和网页结构,例如翻页一般是在首页网址的文件名后面加上_2、_3……(如news_bilingual_2.html)。以下代码只负责拿到链接,不抓取正文。

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
File: bi_news.py
Author: ZhangHaiou(hozhangel@126.com)
Date: 2018/05/04
"""

import urllib
import re
import os

bi_urls = []
def getHtml(url):
    """Fetch ``url`` and return the response body as a list of lines.

    Args:
        url: Address of the page to download.

    Returns:
        The list produced by ``readlines()`` on the opened response, one
        entry per line of the page.
    """
    page = urllib.urlopen(url)
    try:
        return page.readlines()
    finally:
        # The crawl loop calls this once per listing page; close the
        # response so connections do not pile up while the script runs.
        page.close()

def getImg(html):
    """Download every ``.jpg`` image referenced in ``html``.

    An image is recognised by a ``src="....jpg"`` attribute immediately
    followed by `` pic_ext``. Downloads are stored in the current working
    directory as ``0.jpg``, ``1.jpg``, ... in order of appearance.

    Args:
        html: Page source as a single string.
    """
    jpg_pattern = re.compile(r'src="(.+?\.jpg)" pic_ext')
    for index, link in enumerate(jpg_pattern.findall(html)):
        urllib.urlretrieve(link, '%s.jpg' % index)
    
def geturl(html):
    """Print and collect the bilingual-news links found in ``html``.

    Every line containing a
    ``<div class="mr10"><a href="YYYY-MM/DD/content_<digits>.htm"`` anchor
    yields one absolute URL, which is printed and appended to the
    module-level ``bi_urls`` list.

    The first link dated 2016 ends the whole program with exit status 0:
    only newer articles are wanted (this assumes the listing is ordered
    newest first -- confirm against the site).

    Args:
        html: Iterable of text lines of a listing page.

    Raises:
        SystemExit: When a 2016 link is reached.
    """
    import sys  # local import: this script's import block has no ``sys``

    anchor = re.compile(
        r'<div class="mr10"><a href="((\d{4})-\d\d/\d\d/content_\d{4,}\.htm)"')
    for line in html:
        found = anchor.search(line)
        if found is None:
            continue
        if found.group(2) == "2016":
            # sys.exit() flushes buffered stdout on the way out; os._exit()
            # does not, which could drop links already printed when the
            # output is redirected to a file.
            sys.exit(0)
        link = "http://language.chinadaily.com.cn/" + found.group(1)
        print(link)
        bi_urls.append(link)

                
if __name__ == '__main__':
    # Walk the listing pages in order. geturl() terminates the program once
    # it reaches a 2016 article; the extra check below also stops the crawl
    # when a page contributes no link at all, so the loop cannot keep
    # requesting pages forever past the end of the listing.
    n = 1
    while True:
        if n < 2:
            # The first listing page carries no numeric suffix.
            page_url = "http://language.chinadaily.com.cn/news_bilingual.html"
        else:
            page_url = "http://language.chinadaily.com.cn/news_bilingual_" + str(n) + ".html"
        collected = len(bi_urls)
        geturl(getHtml(page_url))
        if len(bi_urls) == collected:
            break
        n = n + 1

 

执行 python bi_news.py > url.txt,把想要的网址保存到 url.txt 中

url.txt内容:

 

下一步是逐行读取url.txt中的链接,简单爬取每个链接对应的网页内容,并把新闻按照月份整理进文件夹,文件名是每个新闻链接末尾的八位数字

 

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
File: content.py
Author: ZhangHaiou(hozhangel@126.com)
Date: 2018/05/04
"""

import urllib
import re
import os
import sys
bi_urls = []
def getHtml(url):
    """Fetch ``url`` and return the whole response body.

    Args:
        url: Address of the page to download.

    Returns:
        Everything ``read()`` returns for the opened response.
    """
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        # Called once per article page; close the response so connections
        # are released while the crawl is still running.
        page.close()

def getImg(html):
    """Save each ``.jpg`` linked from ``html`` into the working directory.

    Matches ``src="....jpg"`` attributes that are directly followed by
    `` pic_ext`` and stores the downloads as ``0.jpg``, ``1.jpg``, ...
    numbered in the order the links occur.

    Args:
        html: Page source as a single string.
    """
    matcher = re.compile(r'src="(.+?\.jpg)" pic_ext')
    links = matcher.findall(html)
    for position, address in enumerate(links):
        target = '%s.jpg' % position
        urllib.urlretrieve(address, target)
    
def geturl(html):
    """Print and collect the article paths found in ``html``.

    For every line containing a
    ``<div class="mr10"><a href="YYYY-MM/DD/content_<digits>.htm"`` anchor,
    the list of all ``YYYY-MM/DD/content_<digits>.htm`` paths on that line
    is printed and appended (as a list) to the module-level ``bi_urls``.

    The first anchor dated 2016 ends the whole program with exit status 0.

    Args:
        html: Iterable of text lines of a listing page.

    Raises:
        SystemExit: When a 2016 link is reached.
    """
    anchor = r'<div class="mr10"><a href="(\d{4})-\d\d/\d\d/content_\d{4,}\.htm"'
    for line in html:
        hit = re.search(anchor, line)
        if hit is None:
            continue
        if hit.group(1) == "2016":
            # sys.exit() flushes buffered output before leaving; os._exit()
            # would skip that and could lose what was already printed.
            sys.exit(0)
        url = re.findall(r'\d{4}-\d\d/\d\d/content_\d{4,}\.htm', line)
        print(url)
        bi_urls.append(url)
def savefile(savepath, content):
    """Write ``content`` to ``savepath``, replacing any existing file.

    Args:
        savepath: Path of the file to create or overwrite.
        content: Text to store in the file.
    """
    with open(savepath, "w") as handle:
        handle.write(content)
                
def build_page_url(url, page):
    """Return the address of page ``page`` (1-based) of a multi-page article.

    Page 1 is ``url`` itself. Later pages get a ``_<page>`` suffix in front
    of the file extension, e.g. ``content_123.htm`` -> ``content_123_2.htm``.
    """
    if page <= 1:
        return url
    stem, extension = os.path.splitext(url)
    return stem + "_" + str(page) + extension


if __name__ == '__main__':
    # Usage: python content.py url.txt
    # Each line of the input file is one article URL. Every page of the
    # article is downloaded and the concatenated HTML is stored as
    # <YYYY-MM>/<article id>.txt.
    doctype = r'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">'

    with open(sys.argv[1], 'r') as url_file:
        for line in url_file:
            # Drop the trailing newline; otherwise the page suffix would be
            # glued on after it and later pages could never be fetched.
            link = line.strip()
            if not link:
                continue
            month = re.search(r'\d\d\d\d\-\d\d', link)
            article_id = re.search(r'\d{6,}', link)
            if month is None or article_id is None:
                # Not an article URL of the expected shape; nothing to name
                # the output after.
                continue

            pages = []
            n = 1
            while True:  # keep going so paginated articles are not truncated
                htm = build_page_url(link, n)
                raw = getHtml(htm)
                if not re.search(doctype, raw):
                    # A response without the article DOCTYPE is treated as a
                    # blank page, i.e. we are past the last page.
                    break
                print(htm)
                pages.append(raw)
                n = n + 1

            date = month.group(0)
            if not os.path.exists(date):  # create the month directory on demand
                os.makedirs(date)
            savefile(date + "/" + article_id.group(0) + ".txt", "".join(pages))
        
      

 

Logo

CSDN联合极客时间,共同打造面向开发者的精品内容学习社区,助力成长!

更多推荐