무회blog

python: 200601- 파이썬 워드클라우드 그리기(konlpy, nltk) 본문

Python

python: 200601- 파이썬 워드클라우드 그리기(konlpy, nltk)

최무회 2020. 6. 1. 20:08
## wordcloud 그리기 
# %pip install wordcloud
# %pip install nltk
# %pip install pandas
# %pip install numpy
# %pip install konlpy
##################################################################

import nltk
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import pandas as pd 
from konlpy.tag import Hannanum
hannanum = Hannanum()
from wordcloud import WordCloud
from collections import Counter

# Read the speech transcript as a list of raw lines (newlines preserved).
# The context manager guarantees the handle is closed even if reading fails.
with open("..\\00.Data\\문재인대통령취임연설문_utf-8.txt", 'r', encoding='utf-8') as f:
    lines = f.readlines()

# Short English sample used for the English word-frequency demo.
lines01 = 'Chief Justice Roberts, President Carter, President Clinton, President Bush, President Obama, fellow'
##################################################################

# Raw string so that \w reaches the regex engine without tripping Python's
# invalid-escape-sequence warning.
tokennizer = RegexpTokenizer(r'[\w]+')
# English stop-word list from the NLTK corpus (must be downloaded beforehand),
# e.g. C:\\Users\\C20A-018\\AppData\\Roaming\\nltk_data\\corpora\\stopwords\\
stop_words = stopwords.words('english')
words = lines01.lower()
# Alternative input: words = str(lines[0:5])
tokens = tokennizer.tokenize(words)
# Drop stop words first, then drop single-character tokens.
stoped_tokens = [i for i in tokens if i not in stop_words]
stoped_tokens2 = [i for i in stoped_tokens if len(i) > 1]
# Notebook-style preview of the ten most frequent remaining tokens.
pd.Series(stoped_tokens2).value_counts().head(10)
##################################################################
# lines
# Extract the nouns of every line with Hannanum and keep only the non-empty
# results. Filtering once per line (instead of re-filtering the whole
# accumulator on every iteration) keeps this a single linear pass.
temp = [
    nouns
    for nouns in (hannanum.nouns(line) for line in lines)
    if nouns
]
    
def flatten(l):
    """Flatten one level of nesting.

    Each element of *l* that is a list (or a list subclass) is expanded in
    place; every other element is kept as-is. Only the first nesting level
    is expanded, and non-list iterables such as strings are not split.

    Args:
        l: Iterable whose elements may be lists.

    Returns:
        A new flat list; the input is not modified.
    """
    flatList = []
    for elem in l:
        # isinstance (rather than an exact type comparison) also accepts
        # list subclasses.
        if isinstance(elem, list):
            flatList.extend(elem)
        else:
            flatList.append(elem)
    return flatList

# Merge the per-line noun lists into one Series, discarding nouns that are
# only a single character long.
word_list = pd.Series([noun for noun in flatten(temp) if len(noun) > 1])
# Notebook-style preview of the ten most frequent nouns.
word_list.value_counts().head(10)
    
# dir(word_list)
##################################################################
# Font file handed to WordCloud for rendering (machine-specific path).
font_path = 'D://app_src/anaconda/06-font/나눔바른고딕/CJnXlA0w_D7iilTV5nZ2CsjiEBQ.ttf'

# Frequencies of the filtered English tokens.
count = Counter(stoped_tokens2)

# Configure an 800x800 white-background cloud and fill it from the
# frequency table in one step.
wordcloud = WordCloud(
    font_path=font_path, width=800, height=800, background_color="white"
).generate_from_frequencies(count)

def __array__(self):
    """Convert to numpy array by delegating to ``self.to_array()``.

    Returns:
        Whatever ``self.to_array()`` returns; the original note describes it
        as the word cloud image as a numpy matrix of size (width, height, 3).
    """
    return self.to_array()
    
def to_array(self):
    """Convert the image produced by ``self.to_image()`` to a numpy array.

    Returns:
        ``np.array(self.to_image())``; the original note describes it as the
        word cloud image as a numpy matrix of size (width, height, 3).
        NOTE(review): the actual axis order depends on what ``to_image()``
        returns -- confirm before relying on it.
    """
    # Imported locally because the surrounding script never imports numpy.
    import numpy as np

    return np.array(self.to_image())
    
# array = wordcloud.to_array()

import matplotlib.pyplot as plt 

# fig = plt.figure(figsize=(10,10))
# plt.imshow(array,interpolation="bilinear")
# plt.show()
# fig.savefig('wordcloud.png')


# Regenerate the cloud from the Korean noun frequencies and grab its pixels.
count = Counter(word_list)
wordcloud = wordcloud.generate_from_frequencies(count)
array = wordcloud.to_array()

# Display the image on a 10x10 inch figure, then write it to disk.
fig = plt.figure(figsize=(10, 10))
plt.imshow(array, interpolation='bilinear')
plt.show()
fig.savefig('word_list_cloud.png')

문재인대통령취임연설문_utf-8.txt
0.01MB

Comments