005.02_topikTs_LLDA
In [1]:
# for_005_topikTs_tomoto
# %pip install tomotopy
# %pip install nltk
# Korean preprocessing
# %pip install --upgrade kiwipiepy
# %pip install konlpy
# import nltk
# nltk.download()
from tomotopy import LLDAModel
import tomotopy as tp
import pandas as pd
import numpy as np
import nltk.stem, nltk.corpus, nltk.tokenize, re
from kiwipiepy import Kiwi
kiwi = Kiwi()
kiwi.prepare()   # required by older kiwipiepy versions; deprecated/removed in recent releases
stemmer = nltk.stem.porter.PorterStemmer()
# NLTK ships no Korean stopword list, so nltk.corpus.stopwords.words('korean') raises an error;
# stopwords are instead built from single-character tokens inside startfunc() below.
stopwords = set()
# ######################## Korean preprocessing
filepath = './testfile/문재인대통령취임연설문_ansi.txt'
def tokenize(sent):
    res, score = kiwi.analyze(sent)[0]    # use the first (best-scoring) analysis
    return [word
            for word, tag, _, _ in res
            if not tag.startswith('E')
            and not tag.startswith('J')
            and not tag.startswith('S')]  # drop endings (E), particles (J) and symbols (S)
# tokenization containers
dic01 = {}
token0 = []
li_model = []
class testLdas:
    @staticmethod
    def startfunc(model):
        for i, line in enumerate(open(filepath)):
            token0 = tokenize(line)
            stopwords = set(wd for wd in token0 if len(wd) <= 1)   # treat single-character words as stopwords
            stopwords.add('기자')                                   # also drop '기자' (reporter)
            token0 = [wd for wd in token0 if wd not in stopwords]   # keep the remaining multi-character tokens
            if token0:
                model.add_doc(token0)    # feed the preprocessed tokens to add_doc
        model.train(tran_cnt)
        for i in range(model.k):
            ttx1 = ', '.join(w for w, p in model.get_topic_words(i, top_n=top_n_cnt))
            ttx1 = re.sub('[a-zA-Z@.]', '', ttx1)
            li_model.append(ttx1)
        dic01['lda_model'] = li_model
k_cnt = 5          # number of topics (rows); an integer between 1 and 32767
top_n_cnt = 7      # number of top words per topic (columns)
min_cf_cnt = 10    # minimum collection frequency of a word; 0 treats every word equally
alpha_cnt = 0.1    # document-topic prior (alpha)
eta_cnt = 0.01     # topic-word prior (eta)
tran_cnt = 100     # number of training iterations
# note: the active HDPModel below hard-codes min_cf=3 and rm_top=1; k_cnt, min_cf_cnt,
# alpha_cnt and eta_cnt are kept only for the commented-out alternatives.
# (tw=TermWeight.ONE, min_cf=0, rm_top=0, k=1, smoothing_alpha=0.1, eta=0.01, seed=?)
model = tp.HDPModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=1)      # Hierarchical Dirichlet Process (HDPModel)
# model = tp.CTModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=1)     # Correlated Topic Model (CTModel)
# model = tp.PAModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=1)     # Pachinko Allocation (PAModel)
# model = tp.HPAModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=1)    # Hierarchical PA (HPAModel)
# model = tp.MGLDAModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=1)  # Multi-Grain LDA (MGLDAModel)
# model = tp.HLDAModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=1)   # Hierarchical LDA (HLDAModel)
# model = tp.LLDAModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=1)   # Labeled LDA (LLDAModel)
# model = tp.PLDAModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=1)   # Partially Labeled LDA (PLDAModel); did not work here
testLdas.startfunc(model)
df01 = pd.DataFrame(dic01)
df01
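The cell above imports LLDAModel but ends up training an HDPModel, so no labels are ever supplied. For the labeled-LDA model the post title refers to, each document needs a label list passed to add_doc. Below is a minimal sketch that reuses tokenize(), filepath, tran_cnt and top_n_cnt from the cell above; labels_for() is a hypothetical placeholder for a real labeling rule and is not part of the original notebook.

# Sketch only: LLDA needs per-document labels. labels_for() is a hypothetical
# placeholder (not in the original code); replace it with a real labeling rule.
import tomotopy as tp

llda = tp.LLDAModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=1)

def labels_for(line_no):
    # Hypothetical rule: label the opening lines 'intro' and the rest 'body'.
    return ['intro'] if line_no < 10 else ['body']

for i, line in enumerate(open(filepath)):
    words = [wd for wd in tokenize(line) if len(wd) > 1]
    if words:
        llda.add_doc(words, labels=labels_for(i))   # LLDAModel.add_doc accepts a labels list

llda.train(tran_cnt)
for t in range(llda.k):
    label = llda.topic_label_dict[t] if t < len(llda.topic_label_dict) else '(latent)'
    print(label, ':', ', '.join(w for w, p in llda.get_topic_words(t, top_n=top_n_cnt)))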
In [ ]:
# filepath2 = './testfile/문재인대통령취임연설문_ansi - 복사본.txt'
# # Topic labeling
# # mdl = tp.HDPModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=5)
# def corpus_and_labeling_example(input_file):
#     corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(), stopwords=['.'])
#     # data_feeder yields a tuple of (raw string, user data) or a str (raw string)
#     corpus.process(open(input_file, encoding='utf-8'))
#     # make LDA model and train
#     mdl = tp.LDAModel(k=20, min_cf=10, min_df=5, corpus=corpus)
#     mdl.train(0)
#     # print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words)
#     # print('Removed top words:', mdl.removed_top_words)
#     for i in range(0, 1000, 10):
#         mdl.train(10)
#         # print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))
#     # extract candidates for auto topic labeling
#     extractor = tp.label.PMIExtractor(min_cf=10, min_df=5, max_len=5, max_cand=10000)
#     cands = extractor.extract(mdl)
#     labeler = tp.label.FoRelevance(mdl, cands, min_df=5, smoothing=1e-2, mu=0.25)
#     for k in range(mdl.k):
#         print("== Topic #{} ==".format(k))
#         print("Labels:", ', '.join(label for label, score in labeler.get_topic_labels(k, top_n=5)))
#         for word, prob in mdl.get_topic_words(k, top_n=10):
#             print(word, prob, sep='\t')
#         print()
# corpus_and_labeling_example(filepath2)
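The commented cell above keeps tomotopy's automatic topic-labeling pipeline (PMIExtractor + FoRelevance) for reference, but it relies on SimpleTokenizer, which is whitespace/English oriented. Here is a minimal sketch of the same pipeline driven by the Kiwi-based tokenize() from the first cell; the lowered min_cf/min_df thresholds are assumptions for this small single-speech corpus, not values from the original.

# Sketch: auto topic labeling on a Korean LDA model built with tokenize() above.
# The extractor/labeler calls mirror the commented example; the thresholds are assumptions.
import tomotopy as tp

mdl = tp.LDAModel(k=k_cnt, min_cf=3, rm_top=1)
for line in open(filepath):
    words = [wd for wd in tokenize(line) if len(wd) > 1]
    if words:
        mdl.add_doc(words)

for _ in range(0, 1000, 10):
    mdl.train(10)                       # train in small steps, 1000 iterations total

extractor = tp.label.PMIExtractor(min_cf=3, min_df=2, max_len=5, max_cand=10000)
cands = extractor.extract(mdl)
labeler = tp.label.FoRelevance(mdl, cands, min_df=2, smoothing=1e-2, mu=0.25)

for k in range(mdl.k):
    print('== Topic #{} =='.format(k))
    print('Labels:', ', '.join(label for label, score in labeler.get_topic_labels(k, top_n=5)))
    print('Words :', ', '.join(w for w, p in mdl.get_topic_words(k, top_n=top_n_cnt)))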
In [ ]:
# raw_corpus_and_labeling_example(filepath2)
# def raw_corpus_and_labeling_example(input_file):
#     from nltk.stem.porter import PorterStemmer
#     from nltk.corpus import stopwords
#     stemmer = PorterStemmer()
#     stopwords = set(stopwords.words('english'))
#     corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(stemmer=stemmer.stem),
#                              stopwords=lambda x: len(x) <= 2 or x in stopwords)
#     # data_feeder yields a tuple of (raw string, user data) or a str (raw string)
#     corpus.process(open(input_file, encoding='utf-8'))
#     # make LDA model and train
#     mdl = tp.LDAModel(k=20, min_cf=10, min_df=5, corpus=corpus)
#     mdl.train(0)
#     # print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words)
#     print('Removed top words:', mdl.removed_top_words)
#     for i in range(0, 1000, 10):
#         mdl.train(10)
#         # print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))
#     # extract candidates for auto topic labeling
#     extractor = tp.label.PMIExtractor(min_cf=10, min_df=5, max_len=5, max_cand=10000)
#     cands = extractor.extract(mdl)
#     labeler = tp.label.FoRelevance(mdl, cands, min_df=5, smoothing=1e-2, mu=0.25)
#     for k in range(mdl.k):
#         print("== Topic #{} ==".format(k))
#         print("Labels:", ', '.join(label for label, score in labeler.get_topic_labels(k, top_n=5)))
#         for word, prob in mdl.get_topic_words(k, top_n=10):
#             print(word, prob, sep='\t')
#         print()