Python 爬虫处理字体加密

汽车之家:

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import requests
from lxml import etree
import re
import sys
import io
from fontTools.ttLib import TTFont

# Re-wrap stdout so that everything printed by this script is encoded as gb18030.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')


# Scrape an autohome forum post
class AutoSpider:
    """Fetch one autohome forum thread and undo its custom-font text obfuscation.

    The page ships a TTF whose glyphs are named ``uniXXXX``; the post body uses
    the matching code points in place of common Chinese characters.
    """

    def __init__(self):
        """Prepare the HTTP headers (including a captured cookie) sent with the page request."""
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36",
            'host': 'club.autohome.com.cn',
            'cookie': '__ah_uuid=C526DAD3-76F6-42C8-956B-4CBE18611E7B; fvlid=1545293100124hzVSzLWmuB; sessionip=61.149.5.137; area=119999; sessionid=60F897CD-E743-449D-BEBE-44D6533DE992%7C%7C2018-12-20+16%3A05%3A06.291%7C%7Cwww.baidu.com; ahpau=1; sessionuid=60F897CD-E743-449D-BEBE-44D6533DE992%7C%7C2018-12-20+16%3A05%3A06.291%7C%7Cwww.baidu.com; pbcpopclub=0fb65f3c-0c57-43b3-ade4-2752c0517737; ref=www.baidu.com%7C0%7C0%7C0%7C2018-12-20+17%3A56%3A26.073%7C2018-12-20+16%3A05%3A06.291; autoac=DB6482147B98F5F9B9D0834939744526; autotc=3A5FEFF1E14636EA47902DA601BB1DF6; ahpvno=12'}

    @staticmethod
    def _glyph_name_to_char(name):
        """Return the character for a ``uniXXXX`` glyph name, or None for any other name."""
        if not name.startswith('uni'):
            return None
        try:
            return chr(int(name[3:], 16))
        except (ValueError, OverflowError):
            # Not a hexadecimal code point, or outside the Unicode range.
            return None

    @staticmethod
    def _decode(text, glyph_names, words):
        """Replace each glyph's character in *text* with the word at the same position.

        *glyph_names* and *words* are paired by position; surplus entries on
        either side are ignored, as are glyph names that are not ``uniXXXX``.
        """
        table = {}
        for name, word in zip(glyph_names, words):
            char = AutoSpider._glyph_name_to_char(name)
            if char is not None:
                # First occurrence wins, as with sequential str.replace calls.
                table.setdefault(ord(char), word)
        return text.translate(table)

    def getNote(self):
        """Download the thread, fetch its font, and print the decoded post text.

        Writes the downloaded font to ``autohome.ttf`` in the working directory.

        Raises:
            ValueError: if the page contains no ``.ttf`` font URL.
        """
        url = "https://club.autohome.com.cn/bbs/thread-c-2778-69436529-1.html"
        # Fetch the page
        r = requests.get(url, headers=self.headers)
        html = etree.HTML(r.text)
        # Locate the protocol-relative TTF URL inside the @font-face rule
        font_urls = re.findall(r",url\('(//[^']*\.ttf)'\)", r.text)
        if not font_urls:
            raise ValueError("no .ttf font URL found in the page")
        ttf = requests.get("http:" + font_urls[0], stream=True)
        with open("autohome.ttf", "wb") as font_file:
            for chunk in ttf.iter_content(chunk_size=1024):
                if chunk:
                    font_file.write(chunk)
        # Parse the font file
        font = TTFont('autohome.ttf')
        uniList = font.getGlyphOrder()
        # The first glyph is skipped; the remaining names are "uni" + hex code point.
        glyphNames = uniList[1:]
        utf8List = [str(uni[3:]) for uni in glyphNames]
        # NOTE(review): this table is paired with the glyph order by position, so it
        # presumably matches one specific font file - confirm for the font served.
        wordList = ['一', '七', '三', '上', '下', '不', '中', '档', '比', '油', '泥', '灯',
                    '九', '了', '二', '五', '低', '保', '光', '八', '公', '六', '养', '内', '冷',
                    '副', '加', '动', '十', '电', '的', '皮', '盘', '真', '着', '路', '身', '软',
                    '过', '近', '远', '里', '量', '长', '门', '问', '只', '右', '启', '呢', '味',
                    '和', '响', '四', '地', '坏', '坐', '外', '多', '大', '好', '孩', '实', '小',
                    '少', '短', '矮', '硬', '空', '级', '耗', '雨', '音', '高', '左', '开', '当',
                    '很', '得', '性', '自', '手', '排', '控', '无', '是', '更', '有', '机', '来']
        print(utf8List)
        # Extract the post body
        text = html.xpath("string(//div[@class='tz-paragraph'])")
        # The text holds the glyph characters themselves (not their hex codes),
        # so the replacement must be keyed on the decoded characters.
        text = self._decode(text, glyphNames, wordList)
        print(text)


# Runs at module level: build the spider and scrape the thread immediately.
spider = AutoSpider()
spider.getNote()

猫眼(首页票房):

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re
import requests
from fontTools.ttLib import TTFont
from lxml import etree


def job():
    """Print the first movie entry of the maoyan.com home page, obfuscated and then decoded.

    The page's digits are rendered through a per-request WOFF font. This
    function downloads that font to ``fonts.woff``, matches its glyph outlines
    against the reference font ``basefonts.woff`` (whose glyph-to-digit mapping
    is hard-coded below), rewrites the ``&#x....;`` entities in the HTML to
    plain digits, and prints the extracted entry again.

    Raises:
        ValueError: if the page contains no ``.woff`` font URL.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/66.0.3359.139 Safari/537.36 "
    }

    index_url = 'http://maoyan.com/'
    info_xpath = '//*[@id="app"]/div/div[1]/div[1]/div/div[2]/ul/li[1]/a/div[2]/div//text()'

    # Fetch the home page and show the entry as served (digits still obfuscated)
    response_index = requests.get(index_url, headers=headers).text
    index_xml = etree.HTML(response_index)
    info_list = index_xml.xpath(info_xpath)
    a = u'电影名称:%s, 票房总数:%s' % (info_list[1], info_list[4])
    print(a)

    # Locate the font URL; [^']* keeps the match inside a single url('...')
    woff_match = re.search(r"url\('([^']*\.woff)'\)", response_index)
    if woff_match is None:
        raise ValueError("no .woff font URL found in the page")
    woff_url = 'http:' + woff_match.group(1)
    response_woff = requests.get(woff_url, headers=headers).content

    with open('fonts.woff', 'wb') as f:
        f.write(response_woff)

    # base_nums / base_fonts were mapped by hand and must match basefonts.woff
    baseFonts = TTFont('basefonts.woff')
    base_nums = ['7', '9', '0', '3', '6', '5', '2', '1', '4', '8']
    base_fonts = ['uniF04C', 'uniE374', 'uniF426', 'uniEAAA', 'uniF519', 'uniEEC4', 'uniF543', 'uniF7C7', 'uniF046',
                  'uniF08E']
    # Look the reference glyphs up once instead of once per online glyph
    base_glyphs = [(baseFonts['glyf'][name], num) for name, num in zip(base_fonts, base_nums)]

    onlineFonts = TTFont('fonts.woff')
    # onlineFonts.saveXML('test.xml')
    uni_list = onlineFonts.getGlyphNames()[1:-1]
    temp = {}
    # Map each online glyph to a digit by comparing outlines with the reference font;
    # the slice tolerates fonts that expose fewer than ten candidate glyphs
    for glyph_name in uni_list[:10]:
        onlineGlyph = onlineFonts['glyf'][glyph_name]
        for baseGlyph, num in base_glyphs:
            if onlineGlyph == baseGlyph:
                temp["&#x" + glyph_name[3:].lower() + ';'] = num

    # Substitute the entities; skipped when nothing matched, because an empty
    # alternation would match the empty string at every position
    if temp:
        pat = '(' + '|'.join(re.escape(key) for key in temp) + ')'
        response_index = re.sub(pat, lambda x: temp[x.group()], response_index)

    # Extract the same entry again, now with readable digits
    index_xml = etree.HTML(response_index)
    info_list = index_xml.xpath(info_xpath)
    a = u'电影名称:%s, 票房总数:%s' % (info_list[1], info_list[4])
    print(a)


def ttf_to_xml():
    """Dump the local font file ``base.woff`` as XML into ``base.xml``."""
    TTFont('base.woff').saveXML('base.xml')

去哪儿网:

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from fontTools.ttLib import TTFont
import requests
from datetime import datetime
import json

# Request headers carrying an iPhone (iOS 11) Safari User-Agent.
phone_headers = {
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'}

# Request headers carrying a desktop Chrome (Windows) User-Agent.
web_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'
}


def job():
    """Download the font referenced by the flight-list API and dump it as XML.

    The font URL comes from ``get_font_ttf()``. The font is saved in the
    working directory under a name built from the last two URL path segments,
    and its XML dump is written next to it with an ``.xml`` extension.
    """
    import os  # function-scope import: only needed for the extension swap below

    # Glyph-name -> character table for the downloaded font; kept as a reference
    # for reading the XML dump, it is not used by the code below.
    number_dict = {
        "period": ".",
        "zero": "0",
        "one": "1",
        "two": "2",
        "three": "3",
        "four": "4",
        "five": "5",
        "six": "6",
        "seven": "7",
        "eight": "8",
        "nine": "9"
    }
    font_url = get_font_ttf()
    dd = font_url.split('/')[-2:]
    name = dd[0] + dd[1]
    font_content = requests.get(font_url, headers=web_headers).content
    # print(font_content)
    with open(name, 'wb') as f:
        f.write(font_content)
    font = TTFont(name)
    # Swap only the file extension: a plain str.replace("ttf", "xml") would also
    # rewrite "ttf" inside the name, and would leave a non-.ttf name unchanged so
    # that the XML dump overwrote the font file itself.
    font.saveXML(os.path.splitext(name)[0] + '.xml')


def get_font_ttf():
    """Query the Qunar mobile flight-list API and return the obfuscation font URL.

    Posts a one-shot Beijing -> Shanghai search for today's date and reads
    ``data.obfuscate.fontSrc`` from the JSON reply.

    Returns:
        str: the font URL, with ``https:`` prepended to the value from the reply.

    Raises:
        requests.HTTPError: if the API answers with an HTTP error status.
    """
    # NOTE(review): the "r" and "__m__" values are fixed captures; presumably the
    # server accepts them as-is - confirm if requests start being rejected.
    payload = {
        'arrCity': "上海",
        'baby': "0",
        'cabinType': "0",
        'child': "0",
        'depCity': "北京",
        'firstRequest': 'true',
        'from': "touch_index_search",
        'goDate': datetime.now().strftime('%Y-%m-%d'),
        'r': 1544750638857,
        'sort': 5,
        'startNum': 0,
        'underageOption': "",
        '__m__': "fa4863f52526dbbe3b3cba0e3de7e006",
        '_v': 2
    }

    # NOTE(review): no request headers are sent with this POST.
    data = requests.post('https://m.flight.qunar.com/touch/api/domestic/flightlist', data=payload)
    # Fail on HTTP errors here rather than as a JSON/KeyError further down.
    data.raise_for_status()
    dd = json.loads(data.text)
    font_src = "https:" + dd['data']['obfuscate']['fontSrc']
    print(font_src)
    return font_src


# Script entry point: run job() only when this file is executed directly.
if __name__ == '__main__':
    job()

猫眼网站(影院排片):

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import requests
from lxml import html
import re
import woff2otf
from fontTools.ttLib import TTFont
from bs4 import BeautifulSoup as bs


# Scrape maoyan cinema listings
class MaoyanSpider:
    """Fetch one maoyan.com cinema page and decode its font-obfuscated numbers.

    Written for Python 3: all text is handled as ``str`` from download to output.
    """

    def __init__(self):
        """Prepare the base HTTP headers sent with the page request."""
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36"
        }

    @staticmethod
    def _glyph_name_to_char(name):
        """Return the character for a ``uniXXXX`` glyph name, or None for any other name."""
        if not name.startswith('uni'):
            return None
        try:
            return chr(int(name[3:], 16))
        except (ValueError, OverflowError):
            # Not a hexadecimal code point, or outside the Unicode range.
            return None

    @staticmethod
    def _decode(text, char_to_num):
        """Return *text* with every key of *char_to_num* replaced by its value.

        Keys are single characters; characters without an entry are kept.
        """
        return text.translate({ord(char): num for char, num in char_to_num.items()})

    def getNote(self):
        """Download the cinema page and its font, then print each listing with decoded numbers.

        Writes ``maoyan.woff`` and ``maoyan.otf`` to the working directory and
        reads the reference font ``base.otf`` from it.

        Raises:
            ValueError: if the page contains no ``.woff`` font URL.
        """
        url = "http://maoyan.com/cinema/15887?poi=91871213"
        host = {'host': 'maoyan.com',
                'refer': 'http://maoyan.com/news', }
        # Merge without concatenating .items(), which only works on Python 2 lists.
        headers = dict(self.headers)
        headers.update(host)
        # Fetch the page
        r = requests.get(url, headers=headers)
        page = r.text
        # Locate the protocol-relative WOFF URL inside the @font-face rule
        rst = re.findall(r",\nurl\('(//[^']*\.woff)'\) format\('woff'\)", page)
        if not rst:
            raise ValueError("no .woff font URL found in the page")
        ttf = requests.get("http:" + rst[0], stream=True)
        with open("maoyan.woff", "wb") as font_file:
            for chunk in ttf.iter_content(chunk_size=1024):
                if chunk:
                    font_file.write(chunk)
        # Convert the WOFF font to OTF
        woff2otf.convert('maoyan.woff', 'maoyan.otf')
        # Parse both fonts
        baseFont = TTFont('base.otf')
        maoyanFont = TTFont('maoyan.otf')
        # Copy the list so the font's own glyph order is never modified.
        uniList = list(maoyanFont.getGlyphOrder())
        baseNumList = ['.', '3', '5', '1', '2', '7', '0', '6', '9', '8', '4']
        baseUniCode = ['x', 'uniE64B', 'uniE183', 'uniED06', 'uniE1AC', 'uniEA2D', 'uniEBF8',
                       'uniE831', 'uniF654', 'uniF25B', 'uniE3EB']
        # Look the reference glyphs up once instead of once per page glyph
        baseGlyphs = [(baseFont['glyf'][name], num) for name, num in zip(baseUniCode, baseNumList)]
        # Map each page-font character to the reference symbol with the same outline.
        # Building the mapping per glyph keeps character and symbol paired even
        # when a glyph has no match or the font holds extra glyphs.
        charToNum = {}
        for pos, glyphName in enumerate(uniList[1:12], start=1):
            maoyanGlyph = maoyanFont['glyf'][glyphName]
            # Slot 1 is always treated as U+0078 ('x'); other slots are uniXXXX names.
            char = 'x' if pos == 1 else self._glyph_name_to_char(glyphName)
            if char is None:
                continue
            for baseGlyph, num in baseGlyphs:
                if maoyanGlyph == baseGlyph:
                    charToNum[char] = num
                    break
        # Extract the listings
        soup = bs(page, "html.parser")
        index = soup.find_all('div', {'class': 'show-list'})

        print('---------------Prices-----------------')
        mn = soup.find_all('h3', {'class': 'movie-name'})
        ting = soup.find_all('span', {'class': 'hall'})
        mt = soup.find_all('span', {'class': 'begin-time'})
        mw = soup.find_all('span', {'class': 'stonefont'})
        # NOTE(review): the lookups above cover the whole page, not one show-list
        # block, so every pass of the outer loop prints the same rows - confirm
        # whether per-block scoping was intended.
        for _ in index:
            for i, beginTime in enumerate(mt):
                moviename = mn[i].get_text()
                film_ting = ting[i].get_text()
                movietime = beginTime.get_text()
                moviewish = self._decode(mw[i].get_text(), charToNum)
                print(moviename, film_ting, movietime, moviewish)


# Runs at module level: build the spider and scrape the cinema page immediately.
spider = MaoyanSpider()
spider.getNote()

 完整代码下载:https://github.com/tanjunchen/SpiderProject/tree/master/fontfaceDecrypt 

Logo

CSDN联合极客时间,共同打造面向开发者的精品内容学习社区,助力成长!

更多推荐