무회blog
200623-yoyag_test_008-002, Summary Test
In [1]:
import pandas as pd
import time, timeit, os, sys, re
from gensim.summarization import summarize
from collections import Counter
import collections
from nltk.tokenize import sent_tokenize, word_tokenize
#############################################
from datetime import datetime
import numpy as np
import nltk.stem, nltk.corpus, nltk.tokenize
from newspaper import Article
#############################################
import tomotopy as tp
#############################################
from gensim import corpora
from gensim import models
from konlpy.utils import pprint
from kiwipiepy import Kiwi
from konlpy.tag import Hannanum
hannanum = Hannanum()
kiwi = Kiwi()
kiwi.prepare()  # loads the analysis model (required in older kiwipiepy releases)
#############################################
import requests as rq
import selenium as se
from selenium import webdriver
from bs4 import BeautifulSoup
from multiprocessing import Pool
#############################################
from string import punctuation
from heapq import nlargest
In [2]:
def text_cleaning(text):
    # remove emoji
    EMOJI = re.compile('[\U00010000-\U0010ffff]', flags=re.UNICODE)
    text = EMOJI.sub(r'', text)
    # remove email addresses
    email = re.compile('([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)')
    text = email.sub('', text)
    # remove URLs
    url = re.compile('(http|ftp|https)://(?:[-\w.]|(?:%[\da-fA-F]{2}))+')
    text = url.sub('', text)
    # remove HTML tags
    html = re.compile('<[^>]*>')
    text = html.sub('', text)
    # replace special characters with spaces (the period is kept so sentences survive)
    # special = re.compile('[^\w\s]')
    # text = special.sub(' ', text)
    special = ['*', '{', ',', ':', ']', '$', '+', '[', '#', '(', '%', '&', '}', '`', '‘', '’', '·',
               '=', ';', '>', '>', '/', '"', '“', '”', '\\', '?', '~', "'", '<', ')', '^', '!', '_',
               '|', '@', '@', '©', 'ⓒ', '℗', '®', '①', '-', '▶', '…', '☞', '▲']  # everything except '.'
    for ch in special:
        text = text.replace(ch, ' ')
    # collapse the duplicated spaces left behind by the replacements
    while text.find('  ') > 0:
        text = text.replace('  ', ' ')
    # collapse duplicated newlines the same way
    while text.find('\n\n') > 0:
        text = text.replace('\n\n', '\n')
    # a space after '.' is added later, in In [7] (tts.replace('.', '. '))
    # strip leading/trailing whitespace; str.strip() returns a new string, so reassign
    text = text.strip()
    return text
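A quick sanity check of text_cleaning on a made-up sample string (illustrative only, not from the dataset): emails, URLs, HTML tags, emoji, and the listed special characters disappear, while periods survive so the summarizers below can still find sentence boundaries.

sample = '속보! 문의 reporter@example.com https://news.example.com <b>본문</b> 😀 끝.'
print(text_cleaning(sample))
# -> '속보 문의 본문 끝.'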
In [3]:
def textSummerFunc():
    # punctuation is the list of symbols such as [, ], ? etc.
    STOPWORDS = list(punctuation)
    # thresholds for dropping words too rare or too common to be meaningful
    MIN_WORD_PROP, MAX_WORD_PROP = 0.1, 0.9

    def compute_word_frequencies(word_sentences):
        words = [word for sentence in word_sentences
                 for word in sentence
                 if word not in STOPWORDS]
        counter = Counter(words)
        limit = float(max(counter.values()))
        word_frequencies = {word: freq / limit
                            for word, freq in counter.items()}
        # drop words if too common or too uncommon
        word_frequencies = {word: freq
                            for word, freq in word_frequencies.items()
                            if freq > MIN_WORD_PROP
                            and freq < MAX_WORD_PROP}
        return word_frequencies

    def sentence_score(word_sentence, word_frequencies):
        return sum([word_frequencies.get(word, 0)
                    for word in word_sentence])

    def summarize(text: str, num_sentences=3):
        """
        Summarize the text by returning the most relevant sentences.
        :text the text to summarize
        :num_sentences the number of sentences to return
        """
        # make the text lowercase
        text = text.lower()
        # break the text into sentences
        sentences = sent_tokenize(text)
        # break the sentences into words
        word_sentences = [word_tokenize(sentence)
                          for sentence in sentences]
        # compute the word frequencies
        word_frequencies = compute_word_frequencies(word_sentences)
        # calculate a score for each sentence
        scores = [sentence_score(word_sentence, word_frequencies)
                  for word_sentence in word_sentences]
        sentence_scores = list(zip(sentences, scores))
        # rank the sentences
        top_sentence_scores = nlargest(num_sentences,
                                       sentence_scores,
                                       key=lambda t: t[1])
        # return the top sentences
        return [t[0] for t in top_sentence_scores]

    # read the DataFrame to classify (df2 is a global prepared in In [7])
    ReadDataPd = df2.copy()
    ReadDataPd = ReadDataPd.assign(summarize_word='')
    counter = 0
    # for w in ReadDataPd['내용']:
    for w in ReadDataPd['content']:
        # cleaning
        ReadDoc_CleanText = text_cleaning(w)
        # print("\nReadDoc_CleanText=", ReadDoc_CleanText[:100])
        # build an n-sentence summary
        sum_res = summarize(ReadDoc_CleanText, num_sentences=3)
        ReadDataPd.at[counter, 'summarize_word'] = sum_res
        counter = counter + 1
    # path_to = creatFolder_in + td + '_' + subMenu.strip() + '_' + test_bindo + '_' + str(most_cnt) + '_textSummerFunc' + '.xlsx'
    # ReadDataPd.to_excel(path_to)
    return ReadDataPd

# textSummerFunc()
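The heart of this cell is the normalized word-frequency table. A tiny standalone illustration of the normalization step in compute_word_frequencies (toy English tokens, illustrative only):

words = ['cat', 'cat', 'cat', 'dog', 'dog', 'bird']
counter = Counter(words)
limit = float(max(counter.values()))      # highest raw count, here 3.0
print({w: c / limit for w, c in counter.items()})
# {'cat': 1.0, 'dog': 0.6666666666666666, 'bird': 0.3333333333333333}
# with MIN_WORD_PROP=0.1 and MAX_WORD_PROP=0.9, 'cat' (1.0) would be dropped
# as too common, while 'dog' and 'bird' would be kept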
In [4]:
def gensim_wordSummarize(tts, tts1):
    aa = []
    bb = []
    cc = []
    li_tts = []
    liwd_cnt = []
    tts_cnt = 0
    for i in range(len(tts)):
        tts_cnt = i
        # join each summary's sentences into a single string
        ttss = ''.join(tts[tts_cnt])
        ttss1 = ''.join(tts1[tts_cnt])
        liwd = word_tokenize(ttss)     ##### tokenize the gensim summary
        liwd = [x for x in liwd if len(x) > 2]
        liwd1 = word_tokenize(ttss1)   ##### tokenize the naver summary
        liwd1 = [x for x in liwd1 if len(x) > 2]
        a = Counter(liwd)              # a.most_common lists tokens by descending frequency
        b = Counter(liwd1)
        c = a & b                      # multiset intersection
        c = c.most_common(most_cnt)    # top intersection tokens with counts
        if len(liwd) == 0:
            biyur = 0
        else:
            biyur = round(len(c) / len(liwd), 2) * 100
        biyur = int(biyur)
        biyur = str(biyur) + '%'       # gensim word matching ratio against the naver summary
        ## a | b would give the union instead
        aa.append(a.most_common(3))    # keep only the top few tokens
        bb.append(b.most_common(3))
        liwd_cnt.append(biyur)
        cc.append(len(c))              # intersection size
        li_tts.append(c)
    # df2['요약_단어빈도교집합'] = li_tts
    df2['요약단어_교집합'] = li_tts
    df2['naver_단어빈도'] = aa
    df2['gensim_단어빈도'] = bb
    df2['단어_빈도수'] = cc
    df2['빈도_비율'] = liwd_cnt
    df3 = df2.copy()
    df3['단어매칭율'] = pd.to_numeric(df3['빈도_비율'].str.replace('%', ''))
    df3['단어매칭율'] = str(df3['단어매칭율'].mean()) + '%'
    # df3 = df3[['subMenu','title','yoyag','naver_단어빈도','summarize_word','txdf_단어빈도','요약단어_교집합','빈도_비율','단어_빈도수','단어매칭율']]
    df3 = df3[['subMenu','title','yoyag','naver_단어빈도','summary_yoyag','gensim_단어빈도','요약단어_교집합','빈도_비율','단어_빈도수','단어매칭율']]
    for i in range(len(df3['요약단어_교집합'])):
        if len(df3['요약단어_교집합'][i]) == 0:
            df3['요약단어_교집합'][i] = '무'   # mark empty intersections
        else:
            pass
    # df3 = df3.sort_values(by=['단어_빈도수'], axis=0, ascending=False)  # sort by intersection size
    # df3 = df3.sort_values(by=['빈도_비율'], axis=0, ascending=False)    # sort by matching ratio
    path_to = creatFolder_in + td + '_' + subMenu.strip() + '_' + test_bindo + '_' + str(most_cnt) + '_gensim_wordSummarize' + '.xlsx'
    df3.to_excel(path_to)
    return df3
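The pairwise comparison above hinges on Counter's & operator, which keeps each token at the minimum of its two counts. A minimal illustration with made-up tokens:

a = Counter(['경제', '경제', '금리', '서울'])    # e.g. tokens from the gensim summary
b = Counter(['경제', '경제', '경제', '부동산'])  # e.g. tokens from the naver summary
c = a & b                    # multiset intersection: {'경제': min(2, 3)}
print(c.most_common(2))      # [('경제', 2)]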
In [5]:
def summary_wordSummarize(tts, tts1):
    aa = []
    bb = []
    cc = []
    li_tts = []
    liwd_cnt = []
    tts_cnt = 0
    for i in range(len(tts)):
        tts_cnt = i
        # join each summary's sentences into a single string
        ttss = ''.join(tts[tts_cnt])
        ttss1 = ''.join(tts1[tts_cnt])
        liwd = word_tokenize(ttss)     ##### tokenize the summarize_word summary
        liwd = [x for x in liwd if len(x) > 2]
        liwd1 = word_tokenize(ttss1)   ##### tokenize the naver summary
        liwd1 = [x for x in liwd1 if len(x) > 2]
        a = Counter(liwd)              # a.most_common lists tokens by descending frequency
        b = Counter(liwd1)
        c = a & b                      # multiset intersection
        c = c.most_common(most_cnt)    # top intersection tokens with counts
        if len(liwd) == 0:
            biyur = 0
        else:
            biyur = round(len(c) / len(liwd), 2) * 100
        biyur = int(biyur)
        biyur = str(biyur) + '%'       # word matching ratio against the naver summary
        ## a | b would give the union instead
        aa.append(a.most_common(3))    # keep only the top few tokens
        bb.append(b.most_common(3))
        liwd_cnt.append(biyur)
        cc.append(len(c))              # intersection size
        li_tts.append(c)
    # df2['요약_단어빈도교집합'] = li_tts
    txdf['요약단어_교집합'] = li_tts
    txdf['naver_단어빈도'] = aa
    txdf['txdf_단어빈도'] = bb
    txdf['단어_빈도수'] = cc
    txdf['빈도_비율'] = liwd_cnt
    df3 = txdf.copy()
    df3['단어매칭율'] = pd.to_numeric(df3['빈도_비율'].str.replace('%', ''))
    df3['단어매칭율'] = str(df3['단어매칭율'].mean()) + '%'
    df3 = df3[['subMenu','title','yoyag','naver_단어빈도','summarize_word','txdf_단어빈도','요약단어_교집합','빈도_비율','단어_빈도수','단어매칭율']]
    for i in range(len(df3['요약단어_교집합'])):
        if len(df3['요약단어_교집합'][i]) == 0:
            df3['요약단어_교집합'][i] = '무'   # mark empty intersections
        else:
            pass
    # df3 = df3.sort_values(by=['단어_빈도수'], axis=0, ascending=False)  # sort by intersection size
    # df3 = df3.sort_values(by=['빈도_비율'], axis=0, ascending=False)    # sort by matching ratio
    path_to = creatFolder_in + td + '_' + subMenu.strip() + '_' + test_bindo + '_' + str(most_cnt) + '_summary_wordSummarize' + '.xlsx'
    df3.to_excel(path_to)
    return df3
In [6]:
path = './yoyag_test/'
checkFolder = os.path.isdir(path)
creatFolder_in = path
if checkFolder == True:
pass
else:
os.mkdir(creatFolder_in)
############ 폴더 생성 및 체크
tdd = datetime.today().strftime("%m%d")
checkFolder = os.path.isdir('./yoyag_test/'+'20'+tdd + '/')
creatFolder_in = './yoyag_test/'+'20'+tdd + '/'
if checkFolder == True:
pass
else:
os.mkdir(creatFolder_in)
bindo_li = {}
td = datetime.today().strftime("%Y%m%d") # 오늘 일자
path_dir = './news002/quanbu/'
path_list = os.listdir(path_dir)
path = path_dir + path_list[0]
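As an aside, the check-then-mkdir pattern above can be collapsed into a single call; a sketch of the equivalent using os.makedirs (same folders, no behavior change intended):

creatFolder_in = './yoyag_test/' + '20' + datetime.today().strftime("%m%d") + '/'
os.makedirs(creatFolder_in, exist_ok=True)  # creates parent folders too; no error if present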
In [7]:
subMenu = '' # 타겟 메뉴
sheet_cnt = 1 # 0 - 5 , 정치, 경제, 사회, 생활, 세계, it
most_cnt = 20 # 상위 교집합의 갯수 기준 테스트
test_bindo = '빈도_비율' # 빈도_비율, # 단어_빈도수 ,(교집합 테스트)
# test_bindo = '단어_빈도수' # 빈도_비율, # 단어_빈도수 ,(교집합 테스트)
# test_ratio = 0.5 # 30% 의 비율로 요약 요청
test_wordCnt = 25 # 단어수 20개로 지정
df = pd.read_excel(path, sheet_name=sheet_cnt)
df1 = df.copy()
df1['content'] = df1['content'].str.replace('◆|ⓒ','').str.strip()
df2 = df1[['subMenu','title','content','yoyag']]
##################################################################################################
li_summary = []
chk = 0
pd.set_option('mode.chained_assignment', None) # <==== 경고를 끈다
# ################################################### test001 요약단어_교집합 , gensim_단어요약 VS naver_단어요약
cnt_li = df2['content'].tolist()
subMenu = df2['subMenu'][0]
print(len(cnt_li))
print(cnt_li[73])
for i in cnt_li:
tts = text_cleaning(i)
chk = chk + 1
# print(chk)
# TotalWord = 단어수 세서 넣고
# TotalLine = 스플릿 해서 넣고
# 문장당 단어수 = (TotalWord/TotalLine) * 3.2
output1 = tts.replace('.','. ')
# output = summarize(output1,ratio=test_ratio) ## test_ratio
# output = summarize(output,word_count =test_wordCnt) ## test_wordCnt
output = summarize(output1,word_count =test_wordCnt) ## test_wordCnt
output = output.replace('다.','다._').split('_')
li_summary.append(output)
df2['summary_yoyag'] = pd.Series(li_summary)
tts = df2['summary_yoyag'].tolist()
tts1 = df2['yoyag'].tolist()
####################################################################### test002 요약단어_교집합 , summary_단어요약 VS naver_단어요약
txdf = textSummerFunc() # summarrize 함수를 사용
tts0 = txdf['summarize_word'].tolist()
tts01 = txdf['yoyag'].tolist()
subMenu = txdf['subMenu'][0]
#######################################################################
gensim_wordSummarize(tts,tts1)
# summary_wordSummarize(tts0,tts01)
Out[7]:
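One detail worth calling out from In [7]: gensim's summarize returns a single string, and the replace/split trick recovers individual Korean sentences because news sentences almost always end in '다.'. A minimal illustration with made-up sentences:

sample = '경제가 성장했다. 물가는 안정됐다.'
print(sample.replace('다.', '다._').split('_'))
# ['경제가 성장했다.', ' 물가는 안정됐다.', '']  (note the trailing empty string)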
In [8]:
# subMenu = ''          # target menu
# sheet_cnt = 1         # 0-5: politics, economy, society, life, world, IT
# most_cnt = 20         # number of top intersection tokens to test against
# test_bindo = '빈도_비율'  # '빈도_비율' or '단어_빈도수' (intersection test)
# # test_ratio = 0.5    # request a ratio-based summary
# test_wordCnt = 25     # limit the summary to 25 words
# df = pd.read_excel(path, sheet_name=sheet_cnt)
# df1 = df.copy()
# df1 = df1.drop_duplicates('content')  #### drop duplicated rows
# print(len(df1))
# df1['content'][0]
# # df1['content'] = df1['content'].str.replace('◆|ⓒ','').str.strip()
# # df2 = df1[['subMenu','title','content','yoyag']]
# # df2
In [ ]: