本文基于已有的第三方在线分词标注网站,分析其请求规则,制作了一个小工具。

两个函数的参数均为待标注的文章或字符串,返回值均为标注好词性的文本;切分失败时返回空字符串 ""。

链接如下:

语料库在线:http://corpus.zhonghuayuwen.org/CpsWParser.aspx

传媒语言语料库在线分词标注系统:http://ling.cuc.edu.cn/cucseg/

请合理使用,切勿频繁调用它们的接口!

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pymysql
import requests
import traceback

from lxml import etree

"""
自动分词和词性标注
"""


# 语料库在线
def get_words_zhonghuayuwen(old_str, timeout=30):
    """Segment and POS-tag text with the corpus.zhonghuayuwen.org parser.

    Submits ``old_str`` as an ASP.NET form post to
    ``http://corpus.zhonghuayuwen.org/CpsWParser.aspx`` and reads the tagged
    text back from the element with id ``TBout`` in the returned page.

    :param old_str: text to segment and tag
    :param timeout: seconds to wait for the server before giving up
    :return: the tagged text; "" if the input is empty, the request or
        parsing fails, or the page holds no output longer than the input
    """
    # Nothing to tag: skip the network round trip entirely.
    if not old_str:
        return ""

    try:
        # Hidden ASP.NET form fields copied from the site's page.
        # NOTE(review): presumably these go stale if the page changes -- confirm.
        data = {
            "__VIEWSTATE": "/wEPDwUKMTkxNjQxMjkxOGRk9/66aqWN3F0h8lvlZBxz3uN/OcjS8w7aTPcGVv1a3Jc=",
            "__VIEWSTATEGENERATOR": "B992DC97",
            "__EVENTVALIDATION": "/wEWBQKzsbS2CwK5lIXIBAKTmJvSBQK7q7GGCAKliMfhCycWhRFQfONu2k/cCxuzjQ7heJO8d2RWyCZOiS+faaOE",
            "TBin": old_str,
            # The form's "Button2" (reset) field is deliberately not sent.
            "BT1": "自动分词&标注词性",
            "TBout": "",
        }
        # Both cookies are sent with empty values.
        cookies = {
            "safedog-flow-item": "",
            "ASP.NET_SessionId": "",
        }
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
        }
        # Without a timeout a stalled server would block the caller forever.
        result = requests.post(
            'http://corpus.zhonghuayuwen.org/CpsWParser.aspx',
            data=data,
            cookies=cookies,
            headers=headers,
            timeout=timeout,
        )

        selector = etree.HTML(result.text)
        t = selector.xpath('//*[@id="TBout"]/text()')
        # Accept the output only when it is longer than the input; an empty
        # XPath match is treated the same way as a too-short result.
        if t and len(t[0]) > len(old_str):
            return t[0]
        return ""
    except Exception:
        # Best-effort helper: report the failure and fall back to "".
        # KeyboardInterrupt / SystemExit are intentionally not swallowed.
        traceback.print_exc()
        return ""


# 传媒语言语料库在线分词标注系统
def get_words_cuc_edu(old_str, timeout=30):
    """Segment and POS-tag text with the ling.cuc.edu.cn online system.

    Works in two steps using one fixed session cookie: first post the text
    to ``http://ling.cuc.edu.cn/cucseg/`` so the server generates the
    segmentation, then fetch ``showResult.aspx`` with the same cookie and
    read the result from the element with id ``tboxOutText``.

    :param old_str: text to segment
    :param timeout: seconds to wait for each request before giving up
    :return: the segmented text; "" if the input is empty, a request or
        parsing fails, or the page holds no output longer than the input
    """
    # Nothing to segment: skip both network round trips.
    if not old_str:
        return ""

    try:
        url = "http://ling.cuc.edu.cn/cucseg/"
        # Form fields, including hidden ASP.NET state copied from the page.
        # NOTE(review): presumably these go stale if the page changes -- confirm.
        data = {
            "inputText": old_str,
            "mergeflag": "Merge",  # "NotMerge" selects fine-grained segmentation
            "CateOption": "CateOne",
            "CateSet": "CateSet_PKU",  # Peking University (PKU) setting
            "__VIEWSTATEGENERATOR": "2F03AC06",
            "__EVENTVALIDATION": "/wEdAA2XBlrAenctEnRFS8xXzf6oUtYjgVic9VlzzV6C3Yw6HWK9YLSmwuh7cMftZMmFYep1Fa2hVO0mzKQ98ubp+dlvevIhDNyvshAzFCIkltU2faiwmaLGd4riX1glX/OCIWvHYiBC2I7LpwHqgiAWk5KO85pTRlXyJ29DlwQaO4HLDlaby0IY9gFdVynqGKYNG9wRCYCYrvJ3/wvbK0TQDiD0acOuqFV82Hf03hsNZIYy5364rc2Pa+QK6kiAwoGE5wESnpCbqqoGAZvwGZn0cUQOzYPghECYHysrOvPTK6g7UnWRAia77SScJaporBAq38A=",
            "__VIEWSTATE": "/wEPDwUKLTE5ODQ1MDUyMA9kFgJmD2QWBgIXDw9kFgIeB29uY2xpY2sFFHJldHVybiBDbGVhcl9UZXh0KCk7ZAIZDw9kFgIfAAUZdGhpcy5mb3JtLnRhcmdldD0nX2JsYW5rJ2QCGw8PFgIeBFRleHQFBjE4ODY5N2RkGAEFHl9fQ29udHJvbHNSZXF1aXJlUG9zdEJhY2tLZXlfXxYNBQVNZXJnZQUITm90TWVyZ2UFCE5vdE1lcmdlBQdDYXRlT25lBQdDYXRlQWxsBQdDYXRlQWxsBQpjaGtBbGxDQ0FUBQhDYXRlTm91bgUIQ2F0ZU5vdW4FC0NhdGVTZXRfQ1VDBQtDYXRlU2V0X1BLVQULQ2F0ZVNldF9QS1UFB2Noa1dhcnBsn1HOQEyzwIsnnjhGS4iT/lr/ODqlSHztISFOSepRpg==",
            "chkWarp": "on",
            "btnSend": "切 分",
        }
        # The same hard-coded session id is sent on both requests so the GET
        # can pick up the result produced by the POST.
        cookies = {
            "ASP.NET_SessionId": "5kl5zrm0seotqhwt4gscr3yy"
        }
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
        }
        # Step 1: submit the text. The response body is not used.
        requests.post(
            url=url,
            data=data,
            cookies=cookies,
            headers=headers,
            allow_redirects=True,
            timeout=timeout,
        )

        # Step 2: fetch the result page, with the same headers as the POST.
        result = requests.get(
            'http://ling.cuc.edu.cn/cucseg/showResult.aspx',
            cookies=cookies,
            headers=headers,
            timeout=timeout,
        )

        selector = etree.HTML(result.text)
        t = selector.xpath('//*[@id="tboxOutText"]/text()')
        # Accept the output only when it is longer than the input; an empty
        # XPath match is treated the same way as a too-short result.
        if t and len(t[0]) > len(old_str):
            return t[0]
        print('error')
        return ""
    except Exception as e:
        # Best-effort helper: report the failure and fall back to "".
        # KeyboardInterrupt / SystemExit are intentionally not swallowed.
        print(e.args)
        traceback.print_exc()
        return ""

 

Logo

CSDN联合极客时间,共同打造面向开发者的精品内容学习社区,助力成长!

更多推荐