200623-yoyag_test_008-002, 요약테스트 본문
In [1]:
import pandas as pd
import time, timeit, os, sys , re
from gensim.summarization import summarize
from collections import Counter
import collections
from nltk.tokenize import sent_tokenize,word_tokenize
from datetime import datetime
import numpy as np
import nltk.stem, nltk.corpus, nltk.tokenize
from newspaper import Article
import tomotopy as tp
from gensim import corpora
from gensim import models
from konlpy.utils import pprint
from kiwipiepy import Kiwi
from konlpy.tag import Hannanum
hannanum = Hannanum()
kiwi = Kiwi()
import requests as rq
import selenium as se
from selenium import webdriver
from bs4 import BeautifulSoup
from multiprocessing import Pool
from string import punctuation
from heapq import nlargest
In [2]:
# Patterns and the translation table are built once at import time rather
# than on every call.
_EMOJI_RE = re.compile('[\U00010000-\U0010ffff]', flags=re.UNICODE)
_EMAIL_RE = re.compile(r'([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)')
_URL_RE = re.compile(r'(http|ftp|https)://(?:[-\w.]|(?:%[\da-fA-F]{2}))+')
_HTML_TAG_RE = re.compile(r'<[^>]*>')
_MULTI_SPACE_RE = re.compile(r' {2,}')
_MULTI_NEWLINE_RE = re.compile(r'\n{2,}')
# Special characters that are replaced by a blank. The full stop is
# deliberately NOT in this list so that sentence boundaries survive.
_SPECIAL_CHARS = (
    '*', '{', ',', ':', ']', '$', '+', '[', '#', '(', '%', '&', '}', '`',
    '‘', '’', '·', '=', ';', '>', '/', '"', '“', '”', '\\', '?', '~', "'",
    '<', ')', '^', '!', '_', '|', '@', '©', 'ⓒ', '℗', '®', '①', '-', '▶',
    '…', '☞', '▲',
)
_SPECIAL_TABLE = str.maketrans({ch: ' ' for ch in _SPECIAL_CHARS})


def text_cleaning(text):
    """Strip noise from raw article text while keeping sentence boundaries.

    The steps run in this order: remove emoji (code points U+10000 and
    above), e-mail addresses, URLs and HTML tags; replace the listed special
    characters with a blank; collapse runs of blanks into one blank and runs
    of newlines into one newline.

    :param text: the raw text to clean
    :return: the cleaned text (leading/trailing whitespace is not stripped)
    """
    # Remove emoji
    text = _EMOJI_RE.sub('', text)
    # Remove e-mail addresses (before special characters, because '@', '_'
    # and '-' would otherwise be blanked out and the address would survive)
    text = _EMAIL_RE.sub('', text)
    # Remove URLs (scheme and host part)
    text = _URL_RE.sub('', text)
    # Remove HTML tags
    text = _HTML_TAG_RE.sub('', text)
    # Replace special characters with a blank in a single pass
    text = text.translate(_SPECIAL_TABLE)
    # Collapse the repeated blanks left behind by the replacements above.
    # A regex is used so that a run at index 0 is handled too and the
    # operation is guaranteed to terminate.
    text = _MULTI_SPACE_RE.sub(' ', text)
    # Collapse repeated newlines
    text = _MULTI_NEWLINE_RE.sub('\n', text)
    return text
In [3]:
def textSummerFunc():
    """Summarise every article in the module-level ``df2`` by word frequency.

    A copy of ``df2`` is made, each value of its ``content`` column is cleaned
    with ``text_cleaning`` and reduced to its three highest-scoring sentences,
    and the resulting list of sentences is stored per row in a new
    ``summarize_word`` column.

    :return: the copy of ``df2`` with the added ``summarize_word`` column
    """
    # `punctuation` is the collection of symbols such as [, ], ? and so on.
    STOPWORDS = set(punctuation)
    # Bounds on the normalised word frequency, used to drop words that are
    # too rare or too common to be meaningful.
    # NOTE(review): the original never defined these thresholds; 0.1 / 0.9
    # are chosen defaults and should be confirmed or tuned.
    MIN_WORD_PROP, MAX_WORD_PROP = 0.1, 0.9

    def compute_word_frequencies(word_sentences):
        """Return {word: frequency / max frequency} within the bounds."""
        counter = Counter(
            word
            for sentence in word_sentences
            for word in sentence
            if word not in STOPWORDS
        )
        if not counter:
            # Nothing but punctuation (or no text at all): no word scores.
            return {}
        limit = float(max(counter.values()))
        # Drop words if too common or too uncommon
        return {
            word: freq / limit
            for word, freq in counter.items()
            if MIN_WORD_PROP < freq / limit < MAX_WORD_PROP
        }

    def sentence_score(word_sentence, word_frequencies):
        """Return the sum of the frequencies of the words in a sentence."""
        return sum(word_frequencies.get(word, 0) for word in word_sentence)

    # Named differently from the module-level gensim ``summarize`` import so
    # the two are not confused.
    def summarize_by_frequency(text: str, num_sentences=3):
        """Summarize the text by returning the most relevant sentences.

        :param text: the text to summarize
        :param num_sentences: the number of sentences to return
        :return: list of the top sentences (lower-cased), best first
        """
        # Make the text lowercase
        text = text.lower()
        # Break text into sentences
        sentences = sent_tokenize(text)
        # Break sentences into words
        word_sentences = [word_tokenize(sentence) for sentence in sentences]
        # Compute the word frequencies
        word_frequencies = compute_word_frequencies(word_sentences)
        # Calculate the scores for each of the sentences
        scores = [sentence_score(word_sentence, word_frequencies)
                  for word_sentence in word_sentences]
        sentence_scores = list(zip(sentences, scores))
        # Rank the sentences by score
        top_sentence_scores = nlargest(num_sentences, sentence_scores,
                                       key=lambda t: t[1])
        # Return the top sentences
        return [t[0] for t in top_sentence_scores]

    # Work on a copy of the data to be summarised
    ReadDataPd = df2.copy()
    summaries = []
    for content in ReadDataPd['content']:
        cleaned = text_cleaning(content)
        # Build a 3-sentence summary
        summaries.append(summarize_by_frequency(cleaned, num_sentences=3))
    # Callers read the summaries from the 'summarize_word' column.
    ReadDataPd['summarize_word'] = pd.Series(summaries, index=ReadDataPd.index)
    return ReadDataPd
# textSummerFunc()
In [4]:
def gensim_wordSummarize(tts,tts1):
    """Compare generated (gensim) summaries with Naver summaries by word overlap.

    For every pair of rows the two summaries are tokenised, tokens of length
    2 or less are dropped, and the multiset intersection of the two token
    counts is taken (limited to the ``most_cnt`` most common entries, where
    ``most_cnt`` is a module-level setting).

    Side effect: the per-row result columns are written to the module-level
    ``df2`` before it is copied. No file is written.

    :param tts: per-row generated summaries; each item is a string or a list
        of strings that is concatenated before tokenising
    :param tts1: per-row Naver summaries, same layout as ``tts``
    :return: a new DataFrame with the comparison columns and the overall
        mean matching ratio in '단어매칭율'
    """
    naver_top_words = []
    gensim_top_words = []
    overlap_sizes = []
    overlaps = []
    match_ratios = []
    for gensim_parts, naver_parts in zip(tts, tts1):
        gensim_text = ''.join(gensim_parts)
        naver_text = ''.join(naver_parts)
        # Tokenise the gensim summary, keeping tokens longer than 2 chars
        gensim_tokens = [x for x in word_tokenize(gensim_text) if len(x) > 2]
        # Tokenise the Naver summary, keeping tokens longer than 2 chars
        naver_tokens = [x for x in word_tokenize(naver_text) if len(x) > 2]
        gensim_counts = Counter(gensim_tokens)
        naver_counts = Counter(naver_tokens)
        # Intersection of both counters, most frequent first
        common = (gensim_counts & naver_counts).most_common(most_cnt)
        if gensim_tokens:
            # Share of the gensim tokens matched by a common word, in
            # percent. Rounded once on the percentage so that float error
            # cannot truncate e.g. 29% down to 28%.
            ratio = int(round(len(common) * 100 / len(gensim_tokens)))
        else:
            # No usable tokens: avoid dividing by zero.
            ratio = 0
        match_ratios.append(str(ratio) + '%')
        # Top 3 words of each side; each list feeds the column that carries
        # its own name.
        naver_top_words.append(naver_counts.most_common(3))
        gensim_top_words.append(gensim_counts.most_common(3))
        overlaps.append(common)
        overlap_sizes.append(len(common))
    df2['요약단어_교집합'] = overlaps
    df2['naver_단어빈도'] = naver_top_words
    df2['gensim_단어빈도'] = gensim_top_words
    df2['단어_빈도수'] = overlap_sizes
    df2['빈도_비율'] = match_ratios
    df3 = df2.copy()
    mean_ratio = pd.to_numeric(
        df3['빈도_비율'].str.replace('%', '', regex=False)).mean()
    # The same overall mean is stored on every row.
    df3['단어매칭율'] = str(mean_ratio) + '%'
    df3 = df3[['subMenu','title','yoyag','naver_단어빈도','summary_yoyag','gensim_단어빈도','요약단어_교집합','빈도_비율','단어_빈도수','단어매칭율']].copy()
    # Mark rows without any common word; done column-wise instead of with
    # chained per-cell assignment, which may silently write to a temporary.
    df3['요약단어_교집합'] = [
        words if len(words) > 0 else '무' for words in df3['요약단어_교집합']
    ]
    return df3
In [5]:
def summary_wordSummarize(tts,tts1):
    """Compare frequency-based summaries with Naver summaries by word overlap.

    For every pair of rows the two summaries are tokenised, tokens of length
    2 or less are dropped, and the multiset intersection of the two token
    counts is taken (limited to the ``most_cnt`` most common entries, where
    ``most_cnt`` is a module-level setting).

    Side effect: the per-row result columns are written to the module-level
    ``txdf`` before it is copied. No file is written.

    :param tts: per-row generated summaries; each item is a string or a list
        of strings that is concatenated before tokenising
    :param tts1: per-row Naver summaries, same layout as ``tts``
    :return: a new DataFrame with the comparison columns and the overall
        mean matching ratio in '단어매칭율'
    """
    naver_top_words = []
    summary_top_words = []
    overlap_sizes = []
    overlaps = []
    match_ratios = []
    for summary_parts, naver_parts in zip(tts, tts1):
        summary_text = ''.join(summary_parts)
        naver_text = ''.join(naver_parts)
        # Tokenise the generated summary, keeping tokens longer than 2 chars
        summary_tokens = [x for x in word_tokenize(summary_text) if len(x) > 2]
        # Tokenise the Naver summary, keeping tokens longer than 2 chars
        naver_tokens = [x for x in word_tokenize(naver_text) if len(x) > 2]
        summary_counts = Counter(summary_tokens)
        naver_counts = Counter(naver_tokens)
        # Intersection of both counters, most frequent first
        common = (summary_counts & naver_counts).most_common(most_cnt)
        if summary_tokens:
            # Share of the generated-summary tokens matched by a common
            # word, in percent. Rounded once on the percentage so that float
            # error cannot truncate e.g. 29% down to 28%.
            ratio = int(round(len(common) * 100 / len(summary_tokens)))
        else:
            # No usable tokens: avoid dividing by zero.
            ratio = 0
        match_ratios.append(str(ratio) + '%')
        # Top 3 words of each side; each list feeds the column that carries
        # its own name.
        naver_top_words.append(naver_counts.most_common(3))
        summary_top_words.append(summary_counts.most_common(3))
        overlaps.append(common)
        overlap_sizes.append(len(common))
    txdf['요약단어_교집합'] = overlaps
    txdf['naver_단어빈도'] = naver_top_words
    txdf['txdf_단어빈도'] = summary_top_words
    txdf['단어_빈도수'] = overlap_sizes
    txdf['빈도_비율'] = match_ratios
    df3 = txdf.copy()
    mean_ratio = pd.to_numeric(
        df3['빈도_비율'].str.replace('%', '', regex=False)).mean()
    # The same overall mean is stored on every row.
    df3['단어매칭율'] = str(mean_ratio) + '%'
    df3 = df3[['subMenu','title','yoyag','naver_단어빈도','summarize_word','txdf_단어빈도','요약단어_교집합','빈도_비율','단어_빈도수','단어매칭율']].copy()
    # Mark rows without any common word; done column-wise instead of with
    # chained per-cell assignment, which may silently write to a temporary.
    df3['요약단어_교집합'] = [
        words if len(words) > 0 else '무' for words in df3['요약단어_교집합']
    ]
    return df3
In [6]:
# Prepare the output folders and pick the input workbook.
path = './yoyag_test/'
# Create the base output folder when it is missing, so the names defined
# below exist regardless of the folder state on disk.
os.makedirs(path, exist_ok=True)
checkFolder = os.path.isdir(path)
creatFolder_in = path

############ Folder creation and check
tdd = datetime.today().strftime("%m%d")
# NOTE(review): the '20' prefix is hard-coded, so the folder name only reads
# as YYMMDD for the year 2020 - confirm whether "%y%m%d" was intended.
creatFolder_in = './yoyag_test/'+'20'+tdd + '/'
os.makedirs(creatFolder_in, exist_ok=True)
checkFolder = os.path.isdir(creatFolder_in)

bindo_li = {}
td = datetime.today().strftime("%Y%m%d")  # today's date
path_dir = './news002/quanbu/'
# os.listdir returns entries in arbitrary order; sort so that the same file
# is picked on every run.
path_list = sorted(os.listdir(path_dir))
if not path_list:
    raise FileNotFoundError('no input file found in ' + path_dir)
path = path_dir + path_list[0]
In [7]:
# Settings for the summary comparison run.
subMenu = ''  # target menu
sheet_cnt = 1  # 0 - 5: politics, economy, society, life, world, IT
most_cnt = 20  # how many of the top common words to keep
test_bindo = '빈도_비율'  # which measure is tested: '빈도_비율' or '단어_빈도수'
test_wordCnt = 25  # word budget passed to the gensim summariser

df = pd.read_excel(path, sheet_name=sheet_cnt)
df1 = df.copy()
# '◆|ⓒ' is an alternation, so it must be applied as a regular expression;
# with a literal match nothing would be removed.
df1['content'] = df1['content'].str.replace('◆|ⓒ', '', regex=True).str.strip()
# Copy so that new columns are added to an independent frame, not a slice.
df2 = df1[['subMenu','title','content','yoyag']].copy()
li_summary = []
chk = 0
pd.set_option('mode.chained_assignment', None)  # silence the chained-assignment warning

# ---- test001: word overlap, gensim summary vs. Naver summary
cnt_li = df2['content'].tolist()
subMenu = df2['subMenu'].iloc[0]
# Idea noted for later: count the total words and total lines and derive a
# per-document word budget as (TotalWord / TotalLine) * 3.2.
for chk, content in enumerate(cnt_li, start=1):
    cleaned = text_cleaning(content)
    # Make sure every full stop is followed by a blank so that the
    # summariser can split sentences.
    output1 = cleaned.replace('.','. ')
    output = summarize(output1,word_count =test_wordCnt)
    # Split the summary into sentences after the sentence ending '다.'
    output = output.replace('다.','다._').split('_')
    # Collect the summary; without this the column below would be all NaN.
    li_summary.append(output)
df2['summary_yoyag'] = pd.Series(li_summary, index=df2.index)
tts = df2['summary_yoyag'].tolist()
tts1 = df2['yoyag'].tolist()

# ---- test002: word overlap, frequency-based summary vs. Naver summary
txdf = textSummerFunc()  # uses the frequency-based summariser
tts0 = txdf['summarize_word'].tolist()
tts01 = txdf['yoyag'].tolist()
subMenu = txdf['subMenu'].iloc[0]
In [8]:
# subMenu = '' # 타겟 메뉴
# sheet_cnt = 1 # 0 - 5 , 정치, 경제, 사회, 생활, 세계, it
# most_cnt = 20 # 상위 교집합의 갯수 기준 테스트
# test_bindo = '빈도_비율' # 빈도_비율, # 단어_빈도수 ,(교집합 테스트)
# # test_ratio = 0.5 # 30% 의 비율로 요약 요청
# test_wordCnt = 25 # 단어수 20개로 지정
# df = pd.read_excel(path, sheet_name=sheet_cnt)
# df1 = df.copy()
# df1 = df1.drop_duplicates('content') #### 중복된 행 제거
# print(len(df1))
# df1['content'][0]
# # df1['content'] = df1['content'].str.replace('◆|ⓒ','').str.strip()
# # df2 = df1[['subMenu','title','content','yoyag']]
# # df2
In [ ]:
