用 Python 统计《水浒传》中的高频词汇

用 Python 统计《水浒传》中的高频词汇
#词频统计.py
import jieba
# Frequent tokens that are filtered out of the ranking (generic words such as
# pronouns, connectives and common nouns rather than distinctive terms).
excludes = {
    "两个", "一个", "只见", "如何", "那里", "哥哥", "说道", "军马", "头领",
    "众人", "这里", "兄弟", "梁山泊", "出来", "小人", "今日", "这个", "先锋",
    "三个", "因此", "人马", "问道", "起来", "便是", "妇人", "好汉", "不是",
    "不知", "不曾", "只是", "如此", "次日", "我们", "不得", "如今", "看时",
    "不敢", "来到", "且说", "一面", "只得", "山寨", "原来", "将军", "却是",
}

# Token variants that are folded into one canonical word before counting.
_ALIASES = {"宋江道": "宋江"}


def count_words(words, excluded):
    """Count how often each multi-character word occurs.

    Args:
        words: Iterable of word tokens (strings).
        excluded: Iterable of words to remove from the result.

    Returns:
        A dict mapping each remaining word to its number of occurrences.
        Single-character tokens are skipped, and tokens listed in
        ``_ALIASES`` are counted under their canonical form.
    """
    counts = {}
    for word in words:
        if len(word) == 1:
            continue
        rword = _ALIASES.get(word, word)
        counts[rword] = counts.get(rword, 0) + 1
    for word in excluded:
        # pop() with a default: an excluded word may never occur in the text,
        # in which case a plain ``del`` would raise KeyError.
        counts.pop(word, None)
    return counts


def top_words(counts, limit=10):
    """Return up to ``limit`` ``(word, count)`` pairs, most frequent first.

    Words with equal counts keep the order in which they were first counted
    (the sort is stable). Fewer than ``limit`` pairs are returned when the
    dict holds fewer entries.
    """
    items = list(counts.items())
    items.sort(key=lambda x: x[1], reverse=True)
    return items[:limit]


def main():
    """Read the novel, segment it with jieba and print the ten top words."""
    # errors="ignore" skips bytes that cannot be decoded as GB18030 instead
    # of aborting the whole read.
    with open("水浒传.txt", "r", encoding="gb18030", errors="ignore") as f:
        txt = f.read()
    words = jieba.lcut(txt)
    counts = count_words(words, excludes)
    print("小说《水浒传》中有意义的前十高频词汇为:")
    for word, count in top_words(counts, 10):
        print("{0:<10}{1:>5}".format(word, count))


if __name__ == "__main__":
    main()

%title插图%num