中文语料 自动分词、标注词性 工具
基于已有的第三方网站在线语料切分,分析其获取规则,制作的小小的工具。参数均为须标注的文章或字符串,返回值均为标注好词性的内容。链接如下:语料库在线、传媒语言语料库在线分词标注系统。请合理使用,勿频繁使用它们的接口! #!/usr/bin/env python # -*- coding: utf-8 -*- import pymysql import requests impor...
·
基于已有的第三方在线语料切分网站,分析其请求规则后制作的小工具。
参数均为需标注的文章或字符串,返回值均为标注好词性的内容。
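链接如下:语料库在线(http://corpus.zhonghuayuwen.org/CpsWParser.aspx)、传媒语言语料库在线分词标注系统(http://ling.cuc.edu.cn/cucseg/)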
请合理使用,勿频繁使用它们的接口!
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pymysql
import requests
import traceback
from lxml import etree
"""
自动分词和词性标注
"""
# 语料库在线
def get_words_zhonghuayuwen(old_str):
    """Segment and POS-tag Chinese text with the corpus.zhonghuayuwen.org parser.

    Posts ``old_str`` to the site's web form, replaying captured ASP.NET
    form-state fields, and reads the tagged text back from the element
    with id ``TBout`` in the returned HTML.

    :param old_str: article or string to be segmented and tagged
    :return: the tagged text; "" when the input is empty, the request or
        parsing fails, or the result is not longer than the input
    """
    # Nothing to tag: skip the round trip to the third-party service.
    if not old_str:
        return ""
    try:
        data = {
            "__VIEWSTATE": "/wEPDwUKMTkxNjQxMjkxOGRk9/66aqWN3F0h8lvlZBxz3uN/OcjS8w7aTPcGVv1a3Jc=",
            "__VIEWSTATEGENERATOR": "B992DC97",
            "__EVENTVALIDATION": "/wEWBQKzsbS2CwK5lIXIBAKTmJvSBQK7q7GGCAKliMfhCycWhRFQfONu2k/cCxuzjQ7heJO8d2RWyCZOiS+faaOE",
            "TBin": old_str,
            "BT1": "自动分词&标注词性",
            "TBout": "",
        }
        cookies = {
            "safedog-flow-item": "",
            "ASP.NET_SessionId": "",
        }
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
        }
        # requests has no default timeout; without one a stalled server
        # would block the caller indefinitely.
        result = requests.post(
            'http://corpus.zhonghuayuwen.org/CpsWParser.aspx',
            data=data,
            cookies=cookies,
            headers=headers,
            timeout=30,
        )
        selector = etree.HTML(result.text)
        t = selector.xpath('//*[@id="TBout"]/text()')
        # Tagged output is longer than the input; anything else (including
        # a missing TBout node) is treated as a failed tagging.
        if t and len(t[0]) > len(old_str):
            return t[0]
        return ""
    except Exception:
        # Best-effort helper: report the failure and return "", but let
        # KeyboardInterrupt / SystemExit propagate.
        traceback.print_exc()
        return ""
# 传媒语言语料库在线分词标注系统
def get_words_cuc_edu(old_str):
    """Segment and POS-tag Chinese text with the ling.cuc.edu.cn service.

    Uses a fixed session cookie: first posts the text to the segmentation
    form so the server produces the result, then fetches a second page
    (``showResult.aspx``) with the same cookie and reads the text of the
    element with id ``tboxOutText``.

    :param old_str: string to be segmented
    :return: the segmented text; "" when the input is empty, the requests
        or parsing fail, or the result is not longer than the input
    """
    # Nothing to segment: skip both round trips to the third-party service.
    if not old_str:
        return ""
    try:
        url = "http://ling.cuc.edu.cn/cucseg/"
        data = {
            "inputText": old_str,
            "mergeflag": "Merge",  # use "NotMerge" for fine-grained segmentation
            "CateOption": "CateOne",
            "CateSet": "CateSet_PKU",  # PKU (Peking University) tag set
            "__VIEWSTATEGENERATOR": "2F03AC06",
            "__EVENTVALIDATION": "/wEdAA2XBlrAenctEnRFS8xXzf6oUtYjgVic9VlzzV6C3Yw6HWK9YLSmwuh7cMftZMmFYep1Fa2hVO0mzKQ98ubp+dlvevIhDNyvshAzFCIkltU2faiwmaLGd4riX1glX/OCIWvHYiBC2I7LpwHqgiAWk5KO85pTRlXyJ29DlwQaO4HLDlaby0IY9gFdVynqGKYNG9wRCYCYrvJ3/wvbK0TQDiD0acOuqFV82Hf03hsNZIYy5364rc2Pa+QK6kiAwoGE5wESnpCbqqoGAZvwGZn0cUQOzYPghECYHysrOvPTK6g7UnWRAia77SScJaporBAq38A=",
            "__VIEWSTATE": "/wEPDwUKLTE5ODQ1MDUyMA9kFgJmD2QWBgIXDw9kFgIeB29uY2xpY2sFFHJldHVybiBDbGVhcl9UZXh0KCk7ZAIZDw9kFgIfAAUZdGhpcy5mb3JtLnRhcmdldD0nX2JsYW5rJ2QCGw8PFgIeBFRleHQFBjE4ODY5N2RkGAEFHl9fQ29udHJvbHNSZXF1aXJlUG9zdEJhY2tLZXlfXxYNBQVNZXJnZQUITm90TWVyZ2UFCE5vdE1lcmdlBQdDYXRlT25lBQdDYXRlQWxsBQdDYXRlQWxsBQpjaGtBbGxDQ0FUBQhDYXRlTm91bgUIQ2F0ZU5vdW4FC0NhdGVTZXRfQ1VDBQtDYXRlU2V0X1BLVQULQ2F0ZVNldF9QS1UFB2Noa1dhcnBsn1HOQEyzwIsnnjhGS4iT/lr/ODqlSHztISFOSepRpg==",
            "chkWarp": "on",
            "btnSend": "切 分",
        }
        cookies = {
            "ASP.NET_SessionId": "5kl5zrm0seotqhwt4gscr3yy",
        }
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
        }
        # requests has no default timeout; without one a stalled server
        # would block the caller indefinitely.
        requests.post(
            url=url,
            data=data,
            cookies=cookies,
            headers=headers,
            allow_redirects=True,
            timeout=30,
        )
        # The result page is fetched separately, tied to the POST above
        # only through the shared session cookie.
        result = requests.get(
            'http://ling.cuc.edu.cn/cucseg/showResult.aspx',
            cookies=cookies,
            timeout=30,
        )
        selector = etree.HTML(result.text)
        t = selector.xpath('//*[@id="tboxOutText"]/text()')
        # Segmented output is longer than the input; anything else
        # (including a missing output node) is treated as a failure.
        if t and len(t[0]) > len(old_str):
            return t[0]
        print('error')
        return ""
    except Exception as e:
        # Best-effort helper: report the failure and return "", but let
        # KeyboardInterrupt / SystemExit propagate.
        print(e.args)
        traceback.print_exc()
        return ""
更多推荐
已为社区贡献2条内容
所有评论(0)