文本情感分析+python+正面和负面新闻+新浪微博+情感字典+机器学习

文本情感分析：上一篇完成了对新浪微博的爬取，并解决了模拟登录的问题，小编又开始研究对微博文本的正面和负面情感分析。从网上搜索了好多方法，主要有机器学习和情感字典两类，可是机器学习需要比较深的知识储备，而小编还是小白，所以就选择了情感字典方法。

逆风更适合_飞翔

23935人浏览 · 2017-04-29 12:44:08

逆风更适合_飞翔 · 2017-04-29 12:44:08 发布

文本情感分析

上一篇完成了对新浪微博的爬取，并解决了模拟登录的问题。这次小编开始研究如何对微博文本做正面和负面的情感分析。从网上搜索了好多方法，主要有机器学习和情感字典两类；可是机器学习需要比较深的知识储备，而小编还是小白，所以就选择了情感字典方法。好了，直接上代码，代码可以直接运行。
运行前需要先安装分词库 jieba，直接执行 `pip install jieba` 即可。

一、数据准备

先要准备四个字典文件：情感字典、停用词字典、程度副词字典和否定词字典。

情感字典，如：
最尼玛 -6.70400012637
扰民 -6.49756445867
fuck… -6.32963390433
RNM -6.21861284426
wcnmlgb -5.96710044003
2.5: -5.90459648251
停用词字典，如：
!，”，#，$，&
副词，如：
百分之百 6
倍加 6
备至 6
不得了 6
不堪 6
不可开交 6
不亦乐乎 6
否定词，如：不，没，无，非，莫，弗，勿
如果需要这些字典文件，可以去 CSDN 下载，我已经上传了。

二、情感分析

#!/usr/bin/env python
#coding:utf-8

import io

import jieba

class SentimentAnalysis:
    """Lexicon-based sentiment scorer for short texts.

    The sentence is tokenised, stop words are dropped, and every remaining
    token is handled as exactly one of:

    * a degree adverb -- multiplies the weight of the next scored token;
    * a negation word -- flips the sign of the next scored token;
    * anything else   -- scored with its sentiment-dictionary value, or 0
      when the token is not in the dictionary.

    A positive total indicates positive sentiment, a negative total
    indicates negative sentiment.
    """

    def __init__(self, sentiment, noword, adverb, stopword, tokenizer=None):
        """Load the four dictionaries.

        Args:
            sentiment: Path to the sentiment dictionary; one ``word weight``
                pair per line.
            noword: Path to the negation-word list; one word per line.
            adverb: Path to the degree-adverb dictionary; one
                ``word weight`` pair per line.
            stopword: Path to the stop-word list; one word per line.
            tokenizer: Optional callable mapping the sentence to an iterable
                of text tokens. When omitted, ``jieba.cut`` in accurate mode
                (``cut_all=False``) is used.

        All files are read as UTF-8; a leading byte-order mark is ignored.
        """
        self.__tokenizer = tokenizer
        # Scoring before setSentence() works on an empty sentence instead of
        # failing on a missing attribute.
        self.__sentence = u''
        self.__readFile(sentiment, noword, adverb, stopword)

    @staticmethod
    def __loadWords(path):
        """Return the set of non-empty lines of *path*, line endings removed."""
        with io.open(path, encoding='utf-8-sig') as handle:
            # Strip only the line terminator so entries that are themselves
            # whitespace (e.g. a single space) are kept.
            words = {line.rstrip('\r\n') for line in handle}
        words.discard(u'')
        return words

    @staticmethod
    def __loadWeights(path, report):
        """Return ``{word: float(weight)}`` parsed from *path*.

        Blank lines are skipped. A line that does not end in a numeric
        weight is skipped; it is additionally printed when *report* is true.
        """
        weights = {}
        with io.open(path, encoding='utf-8-sig') as handle:
            for raw in handle:
                line = raw.strip()
                if not line:
                    continue
                try:
                    # Split on the last run of whitespace: the weight is the
                    # final field, whatever separates it from the word.
                    word, value = line.rsplit(None, 1)
                    weights[word] = float(value)
                except ValueError:
                    if report:
                        print(u"数据错误：" + line)
        return weights

    def __readFile(self, sentiment, noword, adverb, stopword):
        """Populate the four lookup tables from their files."""
        # Malformed sentiment entries are skipped silently (best effort);
        # malformed adverb entries are reported.
        self.__sentList = self.__loadWeights(sentiment, False)
        self.__noword = self.__loadWords(noword)
        self.__adverb = self.__loadWeights(adverb, True)
        self.__stopword = self.__loadWords(stopword)

    def setSentence(self, sentence):
        """Store *sentence*, minus leading whitespace, as the text to score."""
        self.__sentence = sentence.lstrip()

    def preDetail(self):
        """Tokenise the current sentence and classify every token.

        Returns:
            A tuple ``(senWord, notWord, degreeWord, newWords)`` of dicts
            keyed by the token's position (as a string) among the tokens
            that survive stop-word removal:

            * ``senWord``    -- position -> sentiment weight (0 if unknown);
            * ``notWord``    -- position -> -1 for negation words;
            * ``degreeWord`` -- position -> degree-adverb weight;
            * ``newWords``   -- position -> token text.
        """
        if self.__tokenizer is None:
            tokens = jieba.cut(self.__sentence, cut_all=False)
        else:
            tokens = self.__tokenizer(self.__sentence)
        kept = [word for word in tokens if word not in self.__stopword]

        senWord = {}
        notWord = {}
        degreeWord = {}
        newWords = {}
        for position, word in enumerate(kept):
            key = str(position)
            newWords[key] = word
            # Precedence: degree adverb, then negation, then sentiment.
            if word in self.__adverb:
                degreeWord[key] = self.__adverb[word]
            elif word in self.__noword:
                notWord[key] = -1
            else:
                senWord[key] = self.__sentList.get(word, 0)
        return senWord, notWord, degreeWord, newWords

    def getScore(self):
        """Return the sentiment score of the current sentence.

        Tokens are visited in order. Negation words and degree adverbs
        accumulate into a weight that is applied to the next scored token
        and then reset to 1, so modifiers only affect the scored token that
        directly follows them. Modifiers with no scored token after them
        have no effect.

        Returns:
            The sum of the weighted sentiment values; 0 when the sentence
            contains no tokens.
        """
        senWord, notWord, degreeWord, newWords = self.preDetail()
        score = 0
        weight = 1
        for position in range(len(newWords)):
            key = str(position)
            if key in notWord:
                weight *= -1
            elif key in degreeWord:
                weight *= float(degreeWord[key])
            else:
                score += weight * float(senWord.get(key, 0))
                weight = 1
        return score

def getAnalysis(sentiment='情感字典.txt', noword='否定词.txt',
                adverb='副词.txt', stopword='停用词.txt'):
    """Build a SentimentAnalysis from dictionary files.

    Called without arguments it uses the four default file names, resolved
    relative to the current working directory. Each path can be overridden
    individually.

    Args:
        sentiment: Path to the sentiment dictionary.
        noword: Path to the negation-word list.
        adverb: Path to the degree-adverb dictionary.
        stopword: Path to the stop-word list.

    Returns:
        A SentimentAnalysis instance constructed from the given files.
    """
    return SentimentAnalysis(sentiment, noword, adverb, stopword)

if __name__ == '__main__':
    # Demo: build the analyser from the default dictionary files and score
    # a single sentence. getAnalysis is defined in this module, so it is
    # called directly rather than through a module prefix.
    s = getAnalysis()
    s.setSentence('句子')
    # A positive score means positive news; a negative score means
    # negative news.
    print(s.getScore())