무회blog

200601-군집분석 ,k-평균군집 본문

Python

200601-군집분석 ,k-평균군집

최무회 2020. 6. 1. 20:09
# Part 2: cluster analysis
# %pip install scikit-learn
# k-means clustering

import pandas as pd
from konlpy.tag import Hannanum
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt 
import re


# Cluster the documents in a CSV file with k-means on bag-of-noun counts and
# draw the clusters on the first two principal components.
hannanum = Hannanum()

# NOTE(review): the 'data' column presumably holds the raw document text and
# 'search' the label used for each row -- confirm against the CSV.
Data = pd.read_csv('./newsTest/test001.csv', engine="python")

# Extract the nouns of every document with the Hannanum tagger.
noun_lists = [hannanum.nouns(text) for text in Data['data']]

# Documents with at most one noun are dropped.  The mask is kept so that the
# row labels assigned below stay aligned with the surviving documents.
keep = [len(nouns) > 1 for nouns in noun_lists]
docs = [' '.join(nouns) for nouns, kept in zip(noun_lists, keep) if kept]

vec = CountVectorizer()
X = vec.fit_transform(docs)
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on releases that predate get_feature_names_out().
if hasattr(vec, 'get_feature_names_out'):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()
df = pd.DataFrame(X.toarray(), columns=feature_names)
# NOTE: no random_state is passed, so cluster assignments may differ per run.
kmeans = KMeans(n_clusters=3).fit(df)

# Project the count vectors onto two principal components for plotting.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(df)
principalDf = pd.DataFrame(data=principalComponents, columns=['principal component 1', 'principal component 2'])
# Label only the rows that survived the noun filter; assigning the unfiltered
# column raises a length-mismatch ValueError as soon as any row was dropped.
principalDf.index = Data.loc[keep, 'search']

# One scatter series per cluster label (0, 1, 2).
for label, color in enumerate(('red', 'blue', 'green')):
    members = kmeans.labels_ == label
    plt.scatter(principalDf.iloc[members, 0], principalDf.iloc[members, 1], s=10, c=color, label='cluster' + str(label + 1))
plt.legend()

 

Comments