开启生长之旅!这是我参与「日新计划 12 月更文挑战」的第11天,点击检查活动详情
大连海事大学自然语言处理NLTK实验:熟练使用NLTK进行文本分词、词频统计、词性标注、词形还原、词干提取,并通过不同的特征提取方法构建情感分类器,不断调整特征提取函数以提高情感分类器的精度。
一、试验目的和要求
-
娴熟运用NLTK进行文本分词,词频计算,词性标示,词性复原,词干提取
-
经过不同的特征提取构建情感分类器
-
不断调整特征提取函数进步情感分类器的精度
二、试验原理和内容
-
了解NLTK运用
-
完成根据NLTK和词袋特征的情感剖析模型
-
根据给出分类器完成办法进一步的改善提取特征办法进步分类器的精确度
改善办法参阅如下:
- 特征选择
– ngram会使得特征空间的大小快速添加
– 特征选择旨在开始除掉那些对分类无益的特征
- 考虑词频信息,如TF-IDF
– 特征从一个词是否呈现,变为这个词呈现了多少次。特征的信息含量添加。
- 处理否定词
– 将否定词+形容词,变为一个新的词
- 'not good' -> NOT_good
三、试验设备
Python 3.8,Pycharm,第三方库NLTK及其语料库
四、试验中心代码
4.1根据二元分词关于分类精确率的进步,加入三元分词
def extract_feature1(text):
    """Bag-of-words features: every token of the lowercased text is a feature."""
    return {token: True for token in word_tokenize(text.lower())}
def extract_feature2(text):
    """Adjective-only features: keep tokens whose POS tag is 'JJ'."""
    tagged = pos_tag(word_tokenize(text.lower()))
    return {token: True for token, tag in tagged if tag == 'JJ'}
def extract_feature3(text):
    """Unigram + bigram features: each token, and each adjacent token pair
    joined by a space, becomes a binary feature."""
    tokens = word_tokenize(text.lower())
    feats = {tok: True for tok in tokens}
    for pair in ngrams(tokens, 2):
        feats[' '.join(pair)] = True
    return feats
def extract_feature4(text):
    """Unigram + trigram features: each token, and each run of three adjacent
    tokens joined by spaces, becomes a binary feature."""
    tokens = word_tokenize(text.lower())
    feats = {tok: True for tok in tokens}
    for triple in ngrams(tokens, 3):
        feats[' '.join(triple)] = True
    return feats
4.2 在以上四个特征函数的基础上去除停用词和标点符号;考虑到“!”“?”具有一定的情感属性,将其保留。
def extract_feature1(text):
    """Bag-of-words features with stop words and punctuation removed.

    `stop_words` and `punctuation_string` are module-level sets defined
    elsewhere in the file.
    """
    kept = set(word_tokenize(text.lower())) - stop_words - punctuation_string
    return {token: True for token in kept}
def extract_feature2(text):
    """Adjective features ('JJ' tag) after dropping stop words and punctuation.

    Note: tagging runs on the deduplicated token set, so the tagger loses
    sentence order — same as the original implementation.
    """
    kept = list(set(word_tokenize(text.lower())) - stop_words - punctuation_string)
    feats = {}
    for token, tag in pos_tag(kept):
        if tag == 'JJ':
            feats[token] = True
    return feats
def extract_feature3(text):
    """Unigram + bigram features with the unigrams filtered.

    Section 4.2 states that stop words AND punctuation are removed, but the
    original only subtracted `punctuation_string` here — inconsistent with
    extract_feature1/2 of this section. Fixed to subtract `stop_words` too.
    Bigrams are still built over the raw token stream so phrase context
    (including stop words like "not") is preserved.
    """
    text = text.lower()
    feature = {}
    # Consistency fix: also remove stop words from the unigram features.
    tokens = list(set(word_tokenize(text)) - stop_words - punctuation_string)
    for word in tokens:
        feature[word] = True
    tokens = word_tokenize(text)
    for bigram in ngrams(tokens, 2):
        feature[' '.join(bigram)] = True
    return feature
def extract_feature4(text):
    """Unigram + trigram features with the unigrams filtered.

    Section 4.2 states that stop words AND punctuation are removed, but the
    original only subtracted `punctuation_string` here — inconsistent with
    extract_feature1/2 of this section. Fixed to subtract `stop_words` too.
    Trigrams are built over the raw token stream to keep phrase context.
    """
    text = text.lower()
    feature = {}
    # Consistency fix: also remove stop words from the unigram features.
    tokens = list(set(word_tokenize(text)) - stop_words - punctuation_string)
    for word in tokens:
        feature[word] = True
    tokens = word_tokenize(text)
    for trigram in ngrams(tokens, 3):
        feature[' '.join(trigram)] = True
    return feature
4.3分词提取词频
def extract_feature1(text):
    """Term-frequency features: map each token to its occurrence count."""
    counts = {}
    for token in word_tokenize(text.lower()):
        counts[token] = counts.get(token, 0) + 1
    return counts
def extract_feature2(text):
    """Term-frequency features restricted to adjectives ('JJ' POS tag)."""
    counts = {}
    for token, tag in pos_tag(word_tokenize(text.lower())):
        if tag == 'JJ':
            counts[token] = counts.get(token, 0) + 1
    return counts
def extract_feature3(text):
    """Unigram + bigram term-frequency features."""
    tokens = word_tokenize(text.lower())
    counts = {}
    for token in tokens:
        counts[token] = counts.get(token, 0) + 1
    for pair in ngrams(tokens, 2):
        key = ' '.join(pair)
        counts[key] = counts.get(key, 0) + 1
    return counts
def extract_feature4(text):
    """Unigram + trigram term-frequency features."""
    tokens = word_tokenize(text.lower())
    counts = {}
    for token in tokens:
        counts[token] = counts.get(token, 0) + 1
    for triple in ngrams(tokens, 3):
        key = ' '.join(triple)
        counts[key] = counts.get(key, 0) + 1
    return counts
4.4分词提取TF-IDF
def extract_feature1(text):
    """TF-IDF features: map each token to its precomputed tf-idf weight.

    Fix: the original used `tfidf.get(word)`, which yields None for tokens
    absent from the table; the classifier then received a meaningless None
    feature value. Missing tokens now default to 0.0.

    NOTE(review): assumes `tfidf` is a module-level {token: weight} mapping
    built from the training corpus — confirm against the rest of the file.
    """
    feature = {}
    for word in word_tokenize(text.lower()):
        feature[word] = tfidf.get(word, 0.0)
    return feature
def extract_feature2(text):
    """TF-IDF features restricted to adjectives ('JJ' POS tag).

    Fix: default missing tokens to 0.0 instead of the None returned by a
    bare `tfidf.get(word)`.
    """
    feature = {}
    for word, pos in pos_tag(word_tokenize(text.lower())):
        if pos == 'JJ':
            feature[word] = tfidf.get(word, 0.0)
    return feature
def extract_feature3(text):
    """Unigram tf-idf weights plus binary bigram features.

    Fixes: (1) missing tokens default to 0.0 instead of None;
    (2) the text was tokenized twice — tokenize once and reuse.
    Bigram features stay binary (True), as in the original design.
    """
    text = text.lower()
    feature = {}
    tokens = word_tokenize(text)  # tokenize once; original called it twice
    for word in tokens:
        feature[word] = tfidf.get(word, 0.0)
    for bigram in ngrams(tokens, 2):
        feature[' '.join(bigram)] = True
    return feature
def extract_feature4(text):
    """Unigram tf-idf weights plus binary trigram features.

    Fixes: (1) missing tokens default to 0.0 instead of None;
    (2) the text was tokenized twice — tokenize once and reuse.
    Trigram features stay binary (True), as in the original design.
    """
    text = text.lower()
    feature = {}
    tokens = word_tokenize(text)  # tokenize once; original called it twice
    for word in tokens:
        feature[word] = tfidf.get(word, 0.0)
    for trigram in ngrams(tokens, 3):
        feature[' '.join(trigram)] = True
    return feature
4.5处理否定词
def extract_feature1(text):
    """Plain bag-of-words baseline used alongside the negation variants."""
    return {token: True for token in word_tokenize(text.lower())}
def extract_feature2(text):
    """Adjective-only baseline: binary features for tokens tagged 'JJ'."""
    feats = {}
    for token, tag in pos_tag(word_tokenize(text.lower())):
        if tag == 'JJ':
            feats[token] = True
    return feats
def extract_feature3(text):
    """Unigram features plus negation-fused bigrams.

    Fix: the stated design is "'not good' -> NOT_good" (a single new token),
    but the original space-joined the bigram, producing the plain feature
    'not good'. Now the negated word is emitted as 'NOT_<word>'.
    Also tokenizes only once instead of twice.
    """
    text = text.lower()
    feature = {}
    tokens = word_tokenize(text)
    for word in tokens:
        feature[word] = True
    for bigram in ngrams(tokens, 2):
        if bigram[0] == 'not':
            # Fuse negation into one marked token, per the design note.
            feature['NOT_' + bigram[1]] = True
    return feature
def extract_feature4(text):
    """Unigram + trigram binary features (no negation handling here)."""
    tokens = word_tokenize(text.lower())
    feats = {token: True for token in tokens}
    for triple in ngrams(tokens, 3):
        feats[' '.join(triple)] = True
    return feats
五、试验结果与剖析
各特征提取办法的精确比较:
传统分词 | 提取形容词 | 二元分词 | 三元分词 | |
---|---|---|---|---|
词袋 | 0.7450 | 0.7250 | 0.7775 | 0.7925 |
去停用词、符号 | 0.7400 | 0.6900 | 0.7775 | 0.7925 |
词频 | 0.7600 | 0.7300 | 0.8000 | 0.8050 |
TF-IDF | 0.7450 | 0.7250 | 0.7775 | 0.7925 |
否定词处理 | \ | \ | 0.7500 | \ |
从自然语言处理的过程上理解,添加特征与去除无用特征都会进步精确率。
词袋模型中,二元分词、三元分词都是增加特征的方法,准确率得到了有效提高;但是仅提取形容词的方法准确率反而降低。初步判断原因是:非形容词中同样包含情感分析所需的重要特征,仅提取形容词在去除部分无用特征的同时也去除了部分有用特征,造成准确率下降。
去停用词、标点符号的办法是去除无用特征的重要办法,考虑到情感剖析中‘!’‘?’都具有必定的情感色彩将其保存。最终作用在减少了停用词和标点符号的特征后仅有形容词的特征精确率大幅下降。在正常的逻辑中这种情况不可能产生,由于停用词表和标点符号并不包含形容词。
计算词频的办法相当于在原有词袋上加入了词频特征,关于精确率的提高作用很强。
TF-IDF在计算词频的基础上加入了逆文档概率这一特征,正常来讲关于精确率提高是要好于词频计算的,在本次试验中作用并不明显,反而比词频计算作用更差。究其原因,很可能是TF-IDF特征与贝叶斯分类器的相适性并不好。
否定词处理相当于二元分词的一个子类,因而独自列出。可以看到处理作用比较于添加二元分词的作用要低出许多比较于传统分词作用也不是很好。