200706-gensim-, gensim_summarize, okapi, gensim, bm25
In [1]:
remarks = """
Read path:  ./news002/quanbu/
Write path: ./yoyag_test/
----------------------------------------------------
exp: class Example_Class
  , class names start with an uppercase letter; word breaks are written with _
exp: def example_function():
  , function names start with a lowercase letter; word breaks are written with _
exp: isVariable
  , variable names use camelCase
----------------------------------------------------
Topic extraction:
  topic function: Pmi_LdaTopic.get_pmiTopics
  parameters:
    get_cnt    = 5        # number of topic words, derived automatically from frequency
    k_cnt      = 1        # number of topics (rows), an integer between 1 and 32767
    top_n_cnt  = get_cnt  # number of topic words (columns)
    min_cf_cnt = 1        # minimum word frequency; 0 treats every word equally
    alpha_cnt  = 0.1      # document-topic Dirichlet prior
    eta_cnt    = 0.01     # topic-word Dirichlet prior
    tran_cnt   = 200      # training iterations
    rm_top2    = 1
----------------------------------------------------
gensim_summarize method:
  gensim_summarize functions:
    Gensim_Summary.gensim_summary_wordCount(text)
    Gensim_Summary.gensim_summary_ratio(text)
    Gensim_Summary.gensim_summary_ratioNodetail(text)
  gensim_summarize parameter:
    text (the text to summarize)
----------------------------------------------------
bm25 method:
  bm25 functions:
    BM25_Start.bm25_okapi(al_list,tg_num)
    BM25_Start.gensim_bm25(al_list,tg_num)
  bm25 parameters:
    al_list (list of all contents)
    ,tg_num (target content index)
----------------------------------------------------
Test parameters:
  chk = 0
  subMenu = ''        # target menu
  sheet_cnt = 0       # 0 - 5: all, politics, economy, society, life, world, IT
  most_cnt = 50       # test threshold for the number of top intersection words
  # test_ratio = 0.5  # request a summary at this ratio
  creatFolder_in = ''
  td = ''             # time default
  text = ''           # global text default
  sentlineCnt = 3     # output three lines  # parameter for the gensim_wordCount method
"""
In [2]:
### 200703-summarize_test_gensim modularization-002-02
import tomotopy as tp
import pandas as pd
import time, timeit, os, sys, re, math
from gensim.summarization import summarize
from collections import Counter
import collections
import operator
import nltk.stem, nltk.corpus, nltk.tokenize
from nltk import sent_tokenize, word_tokenize
from rank_bm25 import BM25Okapi
#############################################
from datetime import datetime
import numpy as np
#############################################
from gensim import corpora
from gensim import models
from konlpy.utils import pprint
#############################################
from string import punctuation
from heapq import nlargest
td = datetime.today().strftime("%Y%m%d")   # today's date
tdd = datetime.today().strftime("%m%d")
path = './news002/quanbu/'
now = datetime.now()
tdnow = now.strftime('%Y%m%d%H%M%S')
In [3]:
### from gensimBM25Source
import math
from six import iteritems
from six.moves import xrange

# BM25 parameters.
PARAM_K1 = 1.5
PARAM_B = 0.75
EPSILON = 0.25

class BM25(object):
    def __init__(self, corpus):
        self.corpus_size = len(corpus)
        self.avgdl = sum(map(lambda x: float(len(x)), corpus)) / self.corpus_size
        self.corpus = corpus
        self.f = []
        self.df = {}
        self.idf = {}
        self.doc_len = []   # per-document lengths, for BM25 length normalization
        self.initialize()

    def initialize(self):
        for document in self.corpus:
            self.doc_len.append(len(document))
            frequencies = {}
            for word in document:
                if word not in frequencies:
                    frequencies[word] = 0
                frequencies[word] += 1
            self.f.append(frequencies)
            for word, freq in iteritems(frequencies):
                if word not in self.df:
                    self.df[word] = 0
                self.df[word] += 1
        for word, freq in iteritems(self.df):
            self.idf[word] = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5)

    def get_score(self, document, index, average_idf):
        score = 0
        for word in document:
            if word not in self.f[index]:
                continue
            idf = self.idf[word] if self.idf[word] >= 0 else EPSILON * average_idf
            # Standard BM25 normalizes by the indexed document's length, not the
            # corpus size; the copied gensim source used corpus_size here, which
            # was a known bug in old gensim versions and is fixed below.
            score += (idf * self.f[index][word] * (PARAM_K1 + 1)
                      / (self.f[index][word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * self.doc_len[index] / self.avgdl)))
        return score

    def get_scores(self, document, average_idf):
        scores = []
        for index in xrange(self.corpus_size):
            score = self.get_score(document, index, average_idf)
            scores.append(score)
        return scores

### Compute the BM25 weights of every document against every other.
def get_bm25_weights(corpus):
    bm25 = BM25(corpus)
    average_idf = sum(map(lambda k: float(bm25.idf[k]), bm25.idf.keys())) / len(bm25.idf.keys())
    weights = []
    for doc in corpus:
        scores = bm25.get_scores(doc, average_idf)
        weights.append(scores)
    return weights
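For reference, a tiny self-contained check of the weight helper above (the toy corpus is invented for illustration):

toy_corpus = [["hello", "world"], ["hello", "bm25"], ["ranking", "function"]]
weights = get_bm25_weights(toy_corpus)
# weights[i][j] is the BM25 score of document j against the tokens of document i;
# each document should score highest (or tied) against itself.
print(weights[0])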
In [4]:
## default values
## Obtain the results of the Pmi_LdaTopic model.
class Pmi_LdaTopic:
    # tokenize handling
    dic01 = {}
    token0 = []
    li_model_PMI = []
    stemmer = nltk.stem.porter.PorterStemmer()
    # NLTK ships no Korean stopword list (stopwords.words('korean') raises an
    # error), so start from an empty set; short tokens are filtered below.
    stopwords = set()

    def get_pmiTopics(tg_num):
        df02 = pd.read_excel('./news002/quanbu/100_20200701084535_Replace_quanbu_nv.xlsx')
        get_cnt = 5          # number of topic words, derived automatically from frequency
        k_cnt = 1            # number of topics (rows), an integer between 1 and 32767
        top_n_cnt = get_cnt  # number of topic words (columns)
        min_cf_cnt = 1       # minimum word frequency; 0 treats every word equally
        alpha_cnt = 0.1      # document-topic Dirichlet prior
        eta_cnt = 0.01       # topic-word Dirichlet prior
        tran_cnt = 200       # training iterations
        rm_top2 = 1
        path_df = './topicTestFile/'
        # tg_num = 0
        model_PMI = tp.LDAModel(k=k_cnt, alpha=alpha_cnt, eta=eta_cnt, min_cf=min_cf_cnt, tw=tp.TermWeight.PMI)
        al_list = df02['content'].tolist()   # read the contents column
        all_list = [al_list[tg_num]]         # the targeted document
        for i, line in enumerate(all_list):
            token0 = word_tokenize(line)
            # treat one-character words plus '기자' ("reporter") as stopwords
            stopwords = set([wd for wd in token0 if len(wd) <= 1]) | {'기자'}
            token0 = [wd for wd in token0 if len(wd) > 1 and wd not in stopwords]
            model_PMI.add_doc(token0)
        model_PMI.train(tran_cnt)
        # collect this call's topics in a local list so repeated calls do not
        # accumulate topics from earlier documents (the original appended to the
        # class-level li_model_PMI, which grew on every call)
        li_topics = []
        for i in range(model_PMI.k):
            ttx2 = ', '.join(w for w, p in model_PMI.get_topic_words(i, top_n=top_n_cnt))
            ttx2 = re.sub('[a-zA-Z@.]', '', ttx2)
            li_topics.append(ttx2)
        Pmi_LdaTopic.dic01['lda_PMI'] = li_topics
        df_pmi = pd.DataFrame(Pmi_LdaTopic.dic01)
        topic_list = df_pmi['lda_PMI'].tolist()
        return topic_list
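Called as below, the function returns one comma-joined string of top topic words per topic (the index 0 is illustrative; the excel path above must exist):

topic_list = Pmi_LdaTopic.get_pmiTopics(0)
print(topic_list)   # e.g. ['단어1, 단어2, ...'] - k_cnt strings, one per topic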
In [5]:
class Start_CreateReadFile:
    ### Read the source file and pick out the three-line summaries.
    # # step_000: entry point.
    def start_createFile001():
        # create the output folders if they do not exist yet
        path_dir1 = './yoyag_test/'
        if not os.path.isdir(path_dir1):
            os.mkdir(path_dir1)
        creatFolder_in = './yoyag_test/' + '20' + tdd + '/'
        if not os.path.isdir(creatFolder_in):
            os.mkdir(creatFolder_in)
        # final read path; make sure it exists before listing it
        path_dir = './news002/quanbu/'
        if not os.path.isdir(path_dir):
            os.mkdir(path_dir)
        path_list = os.listdir(path_dir)
        # read the first file in the folder, all six sheets
        file_path = path_dir + path_list[0]
        ddf_li = [pd.read_excel(file_path, sheet_name=x) for x in range(6)]
        ddf = pd.concat(ddf_li)
        # preprocessing of the data frame
        ddf = ddf[['subMenu', 'title', 'content', 'yoyag']]
        # count the sentences of each reference summary (yoyag)
        sample_list = []
        ss = ddf['yoyag'].tolist()
        for i in ss:
            ts = i.replace('.', '._').split('_')
            ts = list(filter(bool, ts))
            sample_list.append(ts)
        sample_listcnt = [len(sample_list[x]) for x in range(len(sample_list))]
        ddf['yoyag_cnt'] = sample_listcnt
        # keep only articles whose reference summary has exactly three sentences
        ddf = ddf[ddf['yoyag_cnt'] >= 3][['subMenu', 'title', 'content', 'yoyag', 'yoyag_cnt']]
        ddf = ddf[ddf['yoyag_cnt'] < 4][['subMenu', 'title', 'content', 'yoyag', 'yoyag_cnt']]
        ddf_group = ddf.sort_values(by="subMenu", ascending=False).groupby('subMenu').head(17)
        # ddf_group = ddf.sort_values(by="subMenu", ascending=False).groupby('subMenu')
        ddf_group = ddf_group.head(100)
        df2 = ddf_group.copy()
        df2 = df2.rename(columns={'yoyag': 'nv_summarize', 'yoyag_cnt': 'nv_summarize_cnt'})
        df2 = df2[['subMenu', 'title', 'content', 'nv_summarize', 'nv_summarize_cnt']]
        df2.to_excel('./news002/quanbu/news_100.xlsx', sheet_name='통합')
        return df2
    # def start_replaceFile002(df2):
    #     # pd.set_option('mode.chained_assignment', None)  # <==== silence the warning
    #     # ### test001: intersection of summary words, gensim word summary VS naver word summary
    #     contentList = df2['content'].tolist()   # process the content column
    #     subMenu = df2.get('subMenu').tolist()
    #     li_summary_contnet = []
    #     chk = 0
    #     for i in contentList:   # clean each content and summarize it
    #         chk = chk + 1
    #         text = Cleaning_Text.text_cleaning(i)
    #         li_summary_contnet.append(text)   # store the cleaned content
    #         # three-line summaries via gensim_summarize
    #         output = Gensim_Summary.gensim_summary_wordCount(text)
    #         output1 = Gensim_Summary.gensim_summary_ratio(text)
    #         output2 = Gensim_Summary.gensim_summary_ratioNodetail(text)
    #         li_summary.append(output)
    #         li_summary1.append(output1)
    #         li_summary2.append(output2)
    #     df2['gensim_summary_wordCount'] = pd.Series(li_summary)
    #     df2['gs_summarize_ratio'] = pd.Series(li_summary1)
    #     df2['gensim_summary_ratioNodetail'] = pd.Series(li_summary1)
    #     df2['content'] = pd.Series(li_summary_contnet)
    #     for i in df2['nv_summarize']:
    #         output = Cleaning_Text.text_cleaning(i)
    #         li_summary2.append(output)
    #     df2['nv_summarize'] = pd.Series(li_summary2)
    #     # count the lines of each gensim summary
    #     sample_list = []
    #     gs_summarizeList = df2['gs_summarize'].tolist()
    #     for i in gs_summarizeList:
    #         ts = i.replace('.', '._').split('_')
    #         ts = list(filter(bool, ts))
    #         sample_list.append(ts)
    #     sample_listcnt = [len(sample_list[x]) for x in range(len(sample_list))]
    #     df2['gs_summarizeCnt'] = sample_listcnt
    #     # compare the word-match ratios and return
    #     return df2
    # ################################################################################################
    # ## match the gensim line count to 3
    # # df2 = df2[df2['gs_summarizeCnt'] >= 3][['subMenu','title','content','nv_summarize_cnt','gs_summarizeCnt','단어매칭율','nv_summarize','gs_summarize']]
    # # df2 = df2[df2['gs_summarizeCnt'] < 4 ][['subMenu','title','content','nv_summarize_cnt','gs_summarizeCnt','단어매칭율','nv_summarize','gs_summarize']]
    # # df2 = df2.sort_values(by=['단어_빈도수'], axis=0, ascending=False)  ## descending sort
    # ################################################################################################
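The replace('.', '._').split('_') idiom used throughout is how the notebook counts sentences: it re-attaches the period to each sentence before splitting. A quick illustration (the sample text is invented):

yoyag = "첫 문장입니다. 두 번째 문장입니다. 세 번째 문장입니다."
ts = list(filter(bool, yoyag.replace('.', '._').split('_')))
print(len(ts))   # 3 - one element per period-terminated sentence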
In [6]:
class Cleaning_Text:
    def listToText(inputList):
        returnText = ''
        for i in inputList:
            returnText = returnText + i
        rt2 = Cleaning_Text.text_cleaning(returnText)
        return rt2

    ## # step_4, shared code: text cleaning
    def text_cleaning(text):
        ################################## regular expressions for gensim, 200624
        hangul_path9 = '[가-힣]+\.'            # ends with hangul + period
        hangul_path0 = '[가-힣]+\.[가-힣]{1}'  # hangul period + hangul
        hangul_path1 = '[가-힣]+\.[\d]{1}'     # hangul period + digit [0-9]
        hangul_path2 = '[가-힣]+\.[a-z]{1}'    # hangul period + lowercase letter
        hangul_path3 = '[가-힣]+\.[A-Z]{1}'    # hangul period + uppercase letter
        hangul_path4 = '[가-힣]+\.[\S]{1}'     # hangul period + non-whitespace, same as [^ \t\n\r\f\v]
        hangul_path5 = '[가-힣]+\.[\s]{1}'     # hangul period + whitespace, same as [ \t\n\r\f\v]
        hangul_path6 = '[가-힣]+\.[\W]{1}'     # hangul period + non-word character, same as [^a-zA-Z0-9_]
        hangul_path7 = '[가-힣]+\.[\w]{1}'     # hangul period + word character, same as [a-zA-Z0-9_]
        hangul_path8 = '[가-힣]+\.[\b]{1}'     # NOTE: inside [] \b is a backspace, not a word boundary (unused)
        reg_path = hangul_path0 + '|' + hangul_path1 + '|' + hangul_path2 + '|' + hangul_path3 + '|' + hangul_path4 + '|' + hangul_path5
        hangul = re.compile(reg_path)
        result = hangul.findall(text)   # every "hangul + period + next char" match, as a list
        result = list(set(result))
        # insert a newline between the period and the following character
        for x in result:
            text = text.replace(x, x[:-1] + '\n' + x[-1:])
        ### strip the whitespace around each line
        text = text.replace('\n', '_').split('_')
        text = [x.strip() for x in text]
        tts = ''
        for i in text:
            tts = tts + i + '\n'
        text = tts
        ################################## regular expressions for gensim, 200624
        text = re.sub('\[.+?\]', '', text)   # drop everything inside square brackets []
        # drop emoji
        EMOJI = re.compile('[\U00010000-\U0010ffff]', flags=re.UNICODE)
        text = EMOJI.sub(r'', text)
        # drop email addresses
        email = re.compile('([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)')
        text = email.sub('', text)
        # drop URLs
        url = re.compile('(http|ftp|https)://(?:[-\w.]|(?:%[\da-fA-F]{2}))+')
        text = url.sub('', text)
        # drop HTML tags
        html = re.compile('<[^>]*>')
        text = html.sub('', text)
        # replace special characters with a space (periods are kept to preserve sentence boundaries)
        # special = re.compile('[^\w\s]')
        # text = special.sub(' ', text)
        special = ['*', '{', ',', ':', ']', '$', '+', '[', '#', '(', '%', '&', '}', '`', '‘', '’', '·',
                   '=', ';', '>', '>', '/', '"', '“', '”', '\\', '?', '~', "'", '<', ')', '^', '!', '_',
                   '|', '@', '@', '©', 'ⓒ', '℗', '®', '①', '-', '▶', '…', '☞', '▲', '◆', '■']  # everything except '.'
        for chr in special:
            text = text.replace(chr, ' ')
        # collapse the duplicated spaces left behind by the removals
        while '  ' in text:
            text = text.replace('  ', ' ')
        # collapse the duplicated newlines left behind by the removals
        while '\n\n' in text:
            text = text.replace('\n\n', '\n')
        # strip leading and trailing whitespace
        text = text.strip()
        return text
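A quick sanity check of the cleaner (the sample string is invented):

sample = "경제 뉴스입니다.기자 이메일 abc@test.com 참고 [사진] http://example.com ▶"
print(Cleaning_Text.text_cleaning(sample))
# roughly: the email, URL, bracketed tag and trailing symbol are removed,
# and a newline is inserted after the "hangul + period" sentence boundary.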
In [7]:
# point class: summarize text; the output is the text summary
# point params: text, sentlineCnt
class Gensim_Summary:
    def gensim_summary_wordCount(text):
        global sentlineCnt
        # print('sentlineCnt , {}'.format(sentlineCnt))
        totalWord = word_tokenize(text)
        totalWord_cnt = len(totalWord)
        # split into lines at each "period + space" boundary (from text_cleaning)
        totalLine = text.replace('. ', '. _').split('_')
        totalLineCnt = len(totalLine)
        # words per line * 3 = the word budget for a three-line summary
        sentensWordCnt = (totalWord_cnt / totalLineCnt) * sentlineCnt
        sentensWordCnt = int(sentensWordCnt)
        output = summarize(text, word_count=sentensWordCnt)   # summarize the cleaned content
        return output   # the summary text

    def gensim_summary_ratio(text):   # gs_ratio_summarize
        # count the lines
        outputs = text.split('\n')
        outputs = list(filter(bool, outputs))
        TotalLine_cnt = max(len(outputs), 1)
        # ratio that keeps roughly 3 of the total lines,
        # rounded up to the next 0.05 step (0.1 minimum, 1.0 maximum)
        ratio_cnt = round(3 / TotalLine_cnt, 2)
        steps = [x / 100 for x in range(10, 95, 5)]   # 0.10, 0.15, ..., 0.90
        ratio_cnt = next((s for s in steps if ratio_cnt < s), 1)
        output = summarize(text, ratio=ratio_cnt)
        return output

    def gensim_summary_ratioNodetail(text):
        # print('start gensim_summary_ratioNodetail')
        # count the lines
        texts = text.split('\n')
        texts = list(filter(bool, texts))
        totalLineCnt = max(len(texts), 1)
        # ratio that keeps roughly sentlineCnt of the total lines, floor 0.1
        ratioCnt = round(sentlineCnt / totalLineCnt, 2)
        if ratioCnt < 0.1:
            ratioCnt = 0.1
        output = summarize(text, ratio=ratioCnt)
        return output
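A hedged usage sketch of the three summarizers. gensim's extractive summarize needs a long, multi-sentence input, so raw_article below is an assumed placeholder for one real news article body:

raw_article = "..."   # assumed: a long, multi-sentence Korean news article
article = Cleaning_Text.text_cleaning(raw_article)
print(Gensim_Summary.gensim_summary_wordCount(article))     # ~3 lines by word budget
print(Gensim_Summary.gensim_summary_ratio(article))         # ~3 lines by ratio, 0.05 steps
print(Gensim_Summary.gensim_summary_ratioNodetail(article)) # ~3 lines by ratio, 0.1 floor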
In [8]:
class Compare_SummarizeText:
    def compare_word(df2):
        li_summary = []
        li_summary1 = []
        li_summary2 = []
        li_summary3 = []
        ##################################################################################################
        contentList = df2['content'].tolist()   # process the content column
        subMenu = df2.get('subMenu').tolist()
        li_summary_contnet = []
        chk = 0
        for i in contentList:   # clean each content and produce the three-line summaries
            chk = chk + 1
            text = Cleaning_Text.text_cleaning(i)
            li_summary_contnet.append(text)   # store the cleaned content
            # three-line summaries via gensim_summarize
            output1 = Gensim_Summary.gensim_summary_wordCount(text)
            output2 = Gensim_Summary.gensim_summary_ratio(text)
            output3 = Gensim_Summary.gensim_summary_ratioNodetail(text)
            li_summary1.append(output1)
            li_summary2.append(output2)
            li_summary3.append(output3)
        df2['gensim_wordCount'] = pd.Series(li_summary1)
        df2['gs_summarize_ratio'] = pd.Series(li_summary2)
        df2['gensim_summary_ratioNodetail'] = pd.Series(li_summary3)
        df2['content'] = pd.Series(li_summary_contnet)
        output = ''
        for i in df2['nv_summarize']:
            output = Cleaning_Text.text_cleaning(i)
            li_summary.append(output)
        df2['nv_summarize'] = pd.Series(li_summary)
        ################################################################################################################
        # compare the word-match ratios
        tts_gensim_summary_wordCount = df2['gensim_wordCount'].tolist()
        tts_gs_summarize_ratio = df2['gs_summarize_ratio'].tolist()
        tts_gensim_summary_ratioNodetail = df2['gensim_summary_ratioNodetail'].tolist()
        df2 = BM25_Start.start_bm25(df2)   # gensim_bm25 + okapi_bm25
        tts1_nv_summarize = df2['nv_summarize'].tolist()   # reference: the naver summaries
        tts_gensim_bm25 = df2['gensim_bm25'].tolist()
        tts_okapi_bm25 = df2['okapi_bm25'].tolist()
        tokencntBiyur1_1 = []   # "biyur" = 비율 (ratio)
        tokencntBiyur1_2 = []
        tokencntBiyur1_3 = []
        tokencntBiyur2 = []
        tokencntBiyur3 = []
        for tts_cnt in range(len(tts1_nv_summarize)):
            # each column entry is already a plain string
            ttss1_naverSummaryText = tts1_nv_summarize[tts_cnt]
            ttss_gensim_summary_wordCount = tts_gensim_summary_wordCount[tts_cnt]
            ttss_gs_summarize_ratio = tts_gs_summarize_ratio[tts_cnt]
            ttss_gensim_summary_ratioNodetail = tts_gensim_summary_ratioNodetail[tts_cnt]
            ttss_gensim_bm25 = tts_gensim_bm25[tts_cnt]
            ttss_okapi_bm25 = tts_okapi_bm25[tts_cnt]
            # tokenize each summary, keeping only words longer than two characters
            original_tokenList = [x for x in word_tokenize(ttss1_naverSummaryText) if len(x) > 2]
            gensim_tokenList1 = [x for x in word_tokenize(ttss_gensim_summary_wordCount) if len(x) > 2]
            gensim_tokenList2 = [x for x in word_tokenize(ttss_gs_summarize_ratio) if len(x) > 2]
            gensim_tokenList3 = [x for x in word_tokenize(ttss_gensim_summary_ratioNodetail) if len(x) > 2]
            gensimBM25_tokenList1 = [x for x in word_tokenize(ttss_gensim_bm25) if len(x) > 2]
            okapiBM25_tokenList1 = [x for x in word_tokenize(ttss_okapi_bm25) if len(x) > 2]
            # word frequencies
            _topFreqOriginal = Counter(original_tokenList)
            _topFreqGensim1 = Counter(gensim_tokenList1)
            _topFreqGensim2 = Counter(gensim_tokenList2)
            _topFreqGensim3 = Counter(gensim_tokenList3)
            _topFreqGensimBM25 = Counter(gensimBM25_tokenList1)
            _topFreqOkapi = Counter(okapiBM25_tokenList1)
            ### intersections with the reference summary, top most_cnt by frequency
            comMon1_1 = (_topFreqOriginal & _topFreqGensim1).most_common(most_cnt)
            comMon1_2 = (_topFreqOriginal & _topFreqGensim2).most_common(most_cnt)
            comMon1_3 = (_topFreqOriginal & _topFreqGensim3).most_common(most_cnt)
            comMon2 = (_topFreqOriginal & _topFreqGensimBM25).most_common(most_cnt)
            comMon3 = (_topFreqOriginal & _topFreqOkapi).most_common(most_cnt)
            if len(original_tokenList) == 0:
                # no reference tokens: record 0% so the appends below never fail
                # (the original set only an unused variable here, which raised a
                # NameError whenever the first reference summary was empty)
                biyur1_1 = biyur1_2 = biyur1_3 = biyur2 = biyur3 = '0%'
            else:
                # word-match ratio against the naver summary, as a percentage
                biyur1_1 = str(int(round(len(comMon1_1) / len(original_tokenList), 2) * 100)) + '%'
                biyur1_2 = str(int(round(len(comMon1_2) / len(original_tokenList), 2) * 100)) + '%'
                biyur1_3 = str(int(round(len(comMon1_3) / len(original_tokenList), 2) * 100)) + '%'
                biyur2 = str(int(round(len(comMon2) / len(original_tokenList), 2) * 100)) + '%'
                biyur3 = str(int(round(len(comMon3) / len(original_tokenList), 2) * 100)) + '%'
            tokencntBiyur1_1.append(biyur1_1)   # word-match ratio, gensim wordCount
            tokencntBiyur1_2.append(biyur1_2)   # word-match ratio, gensim ratio
            tokencntBiyur1_3.append(biyur1_3)   # word-match ratio, gensim ratioNodetail
            tokencntBiyur2.append(biyur2)       # word-match ratio, gensim BM25
            tokencntBiyur3.append(biyur3)       # word-match ratio, okapi BM25
        # count the lines of each summary column
        def count_lines(col):
            counted = []
            for i in col:
                ts = list(filter(bool, i.replace('.', '._').split('_')))
                counted.append(len(ts))
            return counted
        df2['gensim_summary_wordCount_Cnt'] = count_lines(df2['gensim_wordCount'].tolist())
        df2['gs_summarize_ratio_Cnt'] = count_lines(df2['gs_summarize_ratio'].tolist())
        df2['gensim_summary_ratioNodetail_Cnt'] = count_lines(df2['gensim_summary_ratioNodetail'].tolist())
        df2['gensim_bm25Cnt'] = count_lines(tts_gensim_bm25)
        df2['okapi_bm25Cnt'] = count_lines(tts_okapi_bm25)   # the original counted tts_gensim_bm25 twice (copy-paste bug)
        df2['단어_일치율1_1_gsSummaryWord'] = tokencntBiyur1_1
        df2['단어_일치율1_2_ratio'] = tokencntBiyur1_2
        df2['단어_일치율1_3_ratioNd'] = tokencntBiyur1_3
        df2['단어_일치율2_gsBM25'] = tokencntBiyur2
        df2['단어_일치율3_okaBM25'] = tokencntBiyur3
        df2 = df2[['subMenu', 'title', 'content'
                   , 'nv_summarize_cnt'
                   , 'gensim_summary_wordCount_Cnt', 'gs_summarize_ratio_Cnt', 'gensim_summary_ratioNodetail_Cnt'
                   , 'gensim_bm25Cnt', 'okapi_bm25Cnt'
                   , '단어_일치율1_1_gsSummaryWord', '단어_일치율1_2_ratio', '단어_일치율1_3_ratioNd'
                   , '단어_일치율2_gsBM25', '단어_일치율3_okaBM25'
                   , 'nv_summarize'
                   , 'gensim_wordCount', 'gs_summarize_ratio', 'gensim_summary_ratioNodetail'
                   , 'gensim_bm25', 'okapi_bm25']]
        return df2
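The word-match ratio above relies on Counter intersection keeping, for every token the two summaries share, the minimum of its two counts. A small worked example (the tokens are invented):

from collections import Counter
ref = Counter(['경제', '성장', '전망', '경제'])     # reference summary tokens
cand = Counter(['경제', '전망', '둔화'])            # candidate summary tokens
common = (ref & cand).most_common(50)               # [('경제', 1), ('전망', 1)]
ratio = round(len(common) / 4, 2) * 100             # 4 = number of reference tokens
print(str(int(ratio)) + '%')                        # 50%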
In [9]:
class BM25_Start:
    ## Preprocess every text in the list and build the BM25 corpus.
    def gensim_bm25(al_list, tg_num):
        topic_list = []
        corpus = []
        sentents = []
        textList = []
        # al_list = df02['content'].tolist()   # the contents as a list
        for i in al_list:
            text = i
            texts = Cleaning_Text.text_cleaning(text)
            texts = word_tokenize(texts)
            texts = [x for x in texts if len(x) > 2]
            corpus.append(texts)   # word tokens per document
            tok_corpus = corpus
            text = Cleaning_Text.text_cleaning(text)
            # NOTE: as written, every textList entry holds the *target* document's
            # sentences, so textList[idx] below is the same for any idx
            text = sent_tokenize(al_list[tg_num])
            text = list(filter(bool, text))
            textList.append(text)   # sentence tokens
        topic_list = Pmi_LdaTopic.get_pmiTopics(tg_num)   # extract the topics
        bm25Model = BM25(tok_corpus)
        query = topic_list
        average_idf = sum(map(lambda k: float(bm25Model.idf[k]), bm25Model.idf.keys())) / len(bm25Model.idf.keys())
        scores = bm25Model.get_scores(query, average_idf)
        idx = scores.index(max(scores))   # index of the highest-scoring document
        bm25_summarize = textList[idx]
        tts = ''
        for i in bm25_summarize[:3]:
            tts = tts + i
        tts = Cleaning_Text.text_cleaning(tts)
        sentents.append(tts)
        return sentents

    def bm25_okapi(al_list, tg_num):
        sentents = []
        topic_list = Pmi_LdaTopic.get_pmiTopics(tg_num)    # extract the topics
        # al_list = df02['content'].tolist()               # the contents as a list
        corpus = sent_tokenize(al_list[tg_num])            # sentences of the target content
        tokenized_corpus = [doc.split(" ") for doc in corpus]   # token lists per sentence
        bm25 = BM25Okapi(tokenized_corpus)
        tokenized_query = topic_list                       # the query (topic strings, as produced above)
        bm25_score = bm25.get_scores(tokenized_query)      # match scores
        bm25_summarize = bm25.get_top_n(tokenized_query, corpus, n=3)   # summary sentences
        tts = ''
        for i in bm25_summarize:
            tts = tts + i
        tts = Cleaning_Text.text_cleaning(tts)
        sentents.append(tts)
        return sentents

    def start_bm25(df02):
        gsli = []
        okali = []
        Li_gensim_bm25 = []
        Li_okapi_bm25 = []
        content_list = df02['content'].tolist()
        for i, j in enumerate(content_list):
            tg_nums = i
            gsli = BM25_Start.gensim_bm25(content_list, tg_nums)
            gensimText = Cleaning_Text.listToText(gsli)
            Li_gensim_bm25.append(gensimText)
            okali = BM25_Start.bm25_okapi(content_list, tg_nums)
            okapiText = Cleaning_Text.listToText(okali)
            Li_okapi_bm25.append(okapiText)
        df02['gensim_bm25'] = Li_gensim_bm25
        df02['okapi_bm25'] = Li_okapi_bm25
        # df02 = compare_word2(df02)
        return df02
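For comparison, rank_bm25's Okapi implementation can be exercised on its own with a toy corpus (the sentences and query here are invented):

corpus = ["경제 성장 전망", "코로나 확진 검사", "성장 둔화 전망"]
tokenized_corpus = [doc.split(" ") for doc in corpus]
bm25 = BM25Okapi(tokenized_corpus)
query = "경제 전망".split(" ")
print(bm25.get_scores(query))              # one score per sentence
print(bm25.get_top_n(query, corpus, n=1))  # the best-matching sentence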
In [10]:
## # step_001
##################################################################################################
chk = 0
subMenu = ''       # target menu
sheet_cnt = 0      # 0 - 5: all, politics, economy, society, life, world, IT
most_cnt = 50      # test threshold for the number of top intersection words
# test_ratio = 0.5 # request a summary at this ratio
creatFolder_in = ''
td = ''
text = ''
sentlineCnt = 3    # output three lines

def start_fn():
    # class handles
    startCls = Start_CreateReadFile
    compareCls = Compare_SummarizeText
    # bm25stCls = BM25_Start
    cleantxCls = Cleaning_Text
    gensimsummCls = Gensim_Summary
    df = startCls.start_createFile001()   # create the input folders, read the file and return it
    # df = startCls.start_replaceFile002(concatDFList)   # transform the read file and return it
    df02 = df.copy()
    # df02 = bm25stCls.start_bm25(df02)   # gensim_bm25 + okapi_bm25
    df02 = compareCls.compare_word(df02)
    start_directory = r'D:\app_src\anaconda\04-srcTest\yoyag_test\bm25_summarize'
    os.startfile(start_directory)   # open the output folder (Windows only)
    time.sleep(1)
    df02['remarks'] = remarks
    df02.to_excel('./yoyag_test/bm25_summarize/gs_sm_bm25.xlsx', sheet_name='bm25_gensmimSummarize')
    return df02
In [11]:
if __name__ == "__main__":
    start_now = int(time.time())   # record the start time
    start_fn()                     # run the gensim_summarize pipeline
    ends = int(time.time()) - start_now
    print('end_now, {} sec'.format(ends))
    # path = './news002/quanbu/'   # drop the input file here before starting
##################################################################################################