무회blog

005.02_topikTs_LLDA 본문

Python/mechineLearning

005.02_topikTs_LLDA

최무회 2020. 6. 8. 18:02

 

 

 

In [1]:
# for_005_topikTs_tomoto
# %pip install tomotopy
# %pip install nltk
#  한국어 전처리 
# %pip install --upgrade kiwipiepy  
# %pip install KoNLP
# import nltk
# nltk.download()
from tomotopy import LLDAModel
import tomotopy as tp 
import pandas as pd
import numpy as np
import nltk.stem, nltk.corpus, nltk.tokenize, re
from kiwipiepy import Kiwi
kiwi = Kiwi()
kiwi.prepare()
stemmer = nltk.stem.porter.PorterStemmer() 
stopwords = set(nltk.corpus.stopwords.words('korean')) 
# ######################## # # #  한국어 전처리 

filepath = './testfile/문재인대통령취임연설문_ansi.txt'

def tokenize(sent):
    res, score = kiwi.analyze(sent)[0] # 첫번째 결과를 사용
    return [word
            for word, tag, _, _ in res
            if not tag.startswith('E') 
            and not tag.startswith('J') 
            and not tag.startswith('S')] # 조사, 어미, 특수기호는 제거

# tokenize 처리 
dic01 = {}
token0 = []
li_model = [] 
class testLdas:
    def startfunc(model):
        for i, line in enumerate(open(filepath)):
            token0 = tokenize(line)
            stopwords = set([wd for wd in token0 if len(wd) <= 1]) #  한글자 단어는 불요어로 지정 
            stopwords = set('기자') #  한글자 단어는 불요어로 지정
            token0 = [wd for wd in token0 if len(wd) > 1]          # 한글자 이상 단어 토큰으로 지정 
            model.add_doc(token0)                                    # tokenize함수를 이용해 전처리한 결과를 add_doc에 넣습니다.
        model.train(tran_cnt)    
        for i in range(model.k):
            ttx1= ', '.join(w for w, p in model.get_topic_words(i,top_n=top_n_cnt))
            ttx1 = re.sub('[a-zA-Z@.]','',ttx1)
            li_model.append(ttx1)
            dic01['lda_model'] = li_model

k_cnt      = 5     # 토픽의 개수    , 행 , 1 ~ 32767 사이의 정수
top_n_cnt  = 7     # 토픽의  갯수   , 열
min_cf_cnt = 10    # 단어 최소 출현 빈도  , 0 일시 모든 단어를 동일하게 봄 
alpha_cnt  = 0.1   # 문헌‐토픽 빈도
eta_cnt    = 0.01  # 토픽‐단어 빈도
tran_cnt   = 100   # 자동학습 빈도

# (tw=TermWeight.ONE, min_cf=0, rm_top=0, k=1, smoothing_alpha=0.1, eta=0.01, seed=?)  
model = tp.HDPModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=1)  # 계층적 디리클레 프로세스 ( HDPModel )
# model = tp.CTModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=1)   # 상관 토픽 모델 ( CTModel )
# model = tp.PAModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=1)   # 파칭코 할당 ( PAMODel )
# model = tp.HPAModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=1)   # 계층적 PA ( HPAModel )
# model = tp.MGLDAModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=1)   # 멀티 그레인 LDA ( MGLDAModel )
# model = tp.HLDAModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=1)   # 계층적 LDA ( HLDAModel )
# model = tp.LLDAModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=1)   # 라벨 LDA ( LLDAModel )
# model = tp.PLDAModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=1)   # 부분 라벨 LDA ( PLDAModel ) , 안됨 

testLdas.startfunc(model)

df01 = pd.DataFrame(dic01)
df01
 
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-fc1646417711> in <module>
     45 # model = tp.HLDAModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=1)   # 계층적 LDA ( HLDAModel )
     46 # model = tp.LLDAModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=1)   # 라벨 LDA ( LLDAModel )
---> 47 model = tp.PLDAModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=1)   # 부분 라벨 LDA ( PLDAModel )
     48 
     49 

NameError: name 'tp' is not defined
In [ ]:
# filepath2 = './testfile/문재인대통령취임연설문_ansi - 복사본.txt'

# # 라벨링하기 
# # mdl = tp.HDPModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=5)
# def corpus_and_labeling_example(input_file):
#     corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(), stopwords=['.'])
#     # data_feeder yields a tuple of (raw string, user data) or a str (raw string)
#     corpus.process(open(input_file, encoding='utf-8'))

#     # make LDA model and train
#     mdl = tp.LDAModel(k=20, min_cf=10, min_df=5, corpus=corpus)
#     mdl.train(0)
# #     print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words)
# #     print('Removed top words:', mdl.removed_top_words)
#     for i in range(0, 1000, 10):
#         mdl.train(10)
# #         print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))
    
#     # extract candidates for auto topic labeling
#     extractor = tp.label.PMIExtractor(min_cf=10, min_df=5, max_len=5, max_cand=10000)
#     cands = extractor.extract(mdl)

#     labeler = tp.label.FoRelevance(mdl, cands, min_df=5, smoothing=1e-2, mu=0.25)
#     for k in range(mdl.k):
#         print("== Topic #{} ==".format(k))
#         print("Labels:", ', '.join(label for label, score in labeler.get_topic_labels(k, top_n=5)))
#         for word, prob in mdl.get_topic_words(k, top_n=10):
#             print(word, prob, sep='\t')
#         print()
        
# corpus_and_labeling_example(filepath2)        
In [ ]:
# raw_corpus_and_labeling_example(filepath2)
# def raw_corpus_and_labeling_example(input_file):
#     from nltk.stem.porter import PorterStemmer
#     from nltk.corpus import stopwords
#     stemmer = PorterStemmer()
#     stopwords = set(stopwords.words('english'))
#     corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(stemmer=stemmer.stem), 
#         stopwords=lambda x: len(x) <= 2 or x in stopwords)
#     # data_feeder yields a tuple of (raw string, user data) or a str (raw string)
#     corpus.process(open(input_file, encoding='utf-8'))

#     # make LDA model and train
#     mdl = tp.LDAModel(k=20, min_cf=10, min_df=5, corpus=corpus)
#     mdl.train(0)
# #     print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words)
#     print('Removed top words:', mdl.removed_top_words)
#     for i in range(0, 1000, 10):
#         mdl.train(10)
# #         print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))
    
#     # extract candidates for auto topic labeling
#     extractor = tp.label.PMIExtractor(min_cf=10, min_df=5, max_len=5, max_cand=10000)
#     cands = extractor.extract(mdl)

#     labeler = tp.label.FoRelevance(mdl, cands, min_df=5, smoothing=1e-2, mu=0.25)
#     for k in range(mdl.k):
#         print("== Topic #{} ==".format(k))
#         print("Labels:", ', '.join(label for label, score in labeler.get_topic_labels(k, top_n=5)))
#         for word, prob in mdl.get_topic_words(k, top_n=10):
#             print(word, prob, sep='\t')
#         print()
Comments