
200701-yoyag_test_010: News Summary Processing in Python, Preprocessing

최무회 2020. 7. 1. 13:18
200701-yoyag_test_010-Copy1
In [1]:
### 200701-yoyag_test_010
import pandas as pd
import time, timeit, os, sys, re
from gensim.summarization import summarize   # extractive TextRank summarizer (gensim < 4.0)
from collections import Counter
from nltk.tokenize import sent_tokenize, word_tokenize
#############################################
from datetime import datetime
import numpy as np 
import nltk.stem, nltk.corpus, nltk.tokenize 
from newspaper import Article
#############################################
import tomotopy as tp 
#############################################
from gensim import corpora 
from gensim import models
from konlpy.utils import pprint
from kiwipiepy import Kiwi
from konlpy.tag import Hannanum
hannanum = Hannanum()        # Korean morphological analyzers, loaded here
kiwi = Kiwi()                # but not actually used in this run
kiwi.prepare()
#############################################
from string import punctuation
from heapq import nlargest
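
gensim's summarize() is the extractive TextRank summarizer this notebook relies on; it only exists in gensim < 4.0. A minimal usage sketch, with a made-up English sample text (real inputs should be much longer):

# Minimal sketch of gensim's extractive summarizer (sample text is made up).
from gensim.summarization import summarize

sample = ("Extractive summarization selects existing sentences from a document. "
          "TextRank builds a sentence-similarity graph and ranks sentences by centrality. "
          "gensim implements a TextRank variant in gensim.summarization.summarize. "
          "The output size can be requested as a ratio of the input or as a word count.")

print(summarize(sample, ratio=0.5))       # keep roughly half of the sentences
print(summarize(sample, word_count=30))   # or cap the summary at ~30 words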
In [2]:
# ##  # step_5: enable this to inspect the word-frequency match ratio; it is called from # step_2-1 below.
# def summary_wordSummarize(tts,tts1):
#     aa = []
#     bb = []
#     cc = []
#     li_tts = []
#     liwd_cnt = []
#     tts_cnt = 0        
#     for i in range(len(tts)):
#         tts_cnt = i
#         ttss = ''
#         for w in tts[tts_cnt]:
#             ttss = ttss + w

#         ttss1 = ''
#         for w in tts1[tts_cnt]:
#             ttss1 = ttss1 + w

#         liwd = word_tokenize(ttss)                          #####  tokenize the summarizer output
#         liwd = [x for x in liwd if len(x) > 2]
#         liwd1 = word_tokenize(ttss1)                        #####  tokenize the Naver summary
#         liwd1 = [x for x in liwd1 if len(x) > 2]

#         a = Counter(liwd)   # most_common() lists words by descending frequency
#         b = Counter(liwd1)
#         c = a & b                                           # multiset intersection
#         c = c.most_common(most_cnt)                         # top intersecting words

#         if len(liwd) == 0 :
#             biyur = 0
#         else:
#             biyur = int(round(len(c)/len(liwd), 2)*100)
#         biyur = str(biyur) + '%'                 # word-match ratio against the Naver summary


#         ## (a | b would give the union instead)
#         aa.append(a.most_common(3))   # keep the top few words
#         bb.append(b.most_common(3))

#         liwd_cnt.append(biyur)

#         cc.append(len(c))           # intersection size
#         li_tts.append(c)

#     txdf['요약단어_교집합']   = li_tts
#     txdf['naver_단어빈도']    = aa
#     txdf['txdf_단어빈도']     = bb
#     txdf['단어_빈도수']       = cc
#     txdf['단어_일치율']       = liwd_cnt

#     df3 = txdf.copy()
#     df3['단어매칭율'] = pd.to_numeric(df3['단어_일치율'].str.replace('%',''))
#     df3['단어매칭율'] = str(df3['단어매칭율'].mean()) + '%'
    
#     df3 = df3[['subMenu','title','yoyag','naver_단어빈도','summarize_word','txdf_단어빈도','요약단어_교집합','단어_일치율','단어_빈도수','단어매칭율']]

#     for i in range(len(df3['요약단어_교집합'])):
#         if len(df3['요약단어_교집합'][i]) == 0 :
#             df3.at[i, '요약단어_교집합'] = '무'
#     # optional: sort by intersection size or by match ratio
#     # df3 = df3.sort_values(by=['단어_빈도수'], ascending=False)
#     # df3 = df3.sort_values(by=['단어_일치율'], ascending=False)
#     path_to = creatFolder_in + td + '_'+ subMenu.strip() + '_' + test_bindo +'_'+ str(most_cnt)+'_summary_wordSummarize' +'.xlsx'    
#     df3.to_excel(path_to)
    
#     path=creatFolder_in               
#     path=os.path.realpath(path)
#     os.startfile(path)    
    
#     return df3
# ############################################################
# def textSummerFunc():
#     # string.punctuation provides the symbol list ([, ], ?, ...) used as stopwords
#     STOPWORDS =  list(punctuation)

#     # normalized word-frequency bounds for dropping overly rare or overly common words
#     MIN_WORD_PROP, MAX_WORD_PROP = 0.1, 0.9

#     def compute_word_frequencies(word_sentences):
#         words = [word for sentence in word_sentences 
#                          for word in sentence 
#                              if word not in STOPWORDS]
#         counter = Counter(words)
#         limit = float(max(counter.values()))
#         word_frequencies = {word: freq/limit 
#                                     for word,freq in counter.items()}
#         # Drop words if too common or too uncommon
#         word_frequencies = {word: freq 
#                                 for word,freq in word_frequencies.items() 
#                                     if freq > MIN_WORD_PROP 
#                                     and freq < MAX_WORD_PROP}
#         return word_frequencies

#     def sentence_score(word_sentence, word_frequencies):
#         return sum([ word_frequencies.get(word,0) 
#                         for word in word_sentence])

#     # NOTE: this local summarize() shadows the gensim summarize imported above
#     def summarize(text:str, num_sentences=3):
#         """
#         Summarize the text by returning the most relevant sentences.
#          :text the text to summarize
#          :num_sentences the number of sentences to return
#         """
#         # Make the text lowercase
#         text = text.lower()

#         # Break text into sentences 
#         sentences = sent_tokenize(text)

#         # Break sentences into words
#         word_sentences = [word_tokenize(sentence) 
#                               for sentence in sentences]

#         # Compute the word frequencies
#         word_frequencies = compute_word_frequencies(word_sentences)

#         # Calculate the scores for each of the sentences
#         scores = [sentence_score(word_sentence, word_frequencies)
#                          for word_sentence in word_sentences]
#         sentence_scores = list(zip(sentences, scores))

#         # Rank the sentences
#         top_sentence_scores = nlargest(num_sentences, 
#                                        sentence_scores,
#                                        key=lambda t: t[1])

#         # Return the top sentences
#         return [t[0] for t in top_sentence_scores]


#     # read the data to summarize
#     ReadDataPd = df2.copy()
#     ReadDataPd=ReadDataPd.assign(summarize_word='')
#     counter=0
#     for w in ReadDataPd['content']:
#         # clean the raw article text
#         ReadDoc_CleanText= text_cleaning(w)
#         #print("\nReadDoc_CleanText=",ReadDoc_CleanText[:100])
#         # generate an n-sentence summary
#         sum_res=summarize(ReadDoc_CleanText, num_sentences=3)

#         ReadDataPd.at[counter,'summarize_word']=sum_res
#         counter=counter+1
# #     path_to = creatFolder_in + td + '_'+ subMenu.strip() + '_' + test_bindo +'_'+ str(most_cnt) + '_textSummerFunc'+'.xlsx'    
# #     ReadDataPd.to_excel(path_to)

#     return ReadDataPd    

# # textSummerFunc()    
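
Since the cell above stays commented out, here is a compact runnable sketch of the same frequency-based scoring idea it implements (assumes NLTK's punkt tokenizer data has been downloaded):

# Frequency-scored extractive summary, mirroring textSummerFunc above.
from collections import Counter
from heapq import nlargest
from string import punctuation
from nltk.tokenize import sent_tokenize, word_tokenize

def freq_summarize(text, num_sentences=3):
    sentences = sent_tokenize(text.lower())
    word_sents = [word_tokenize(s) for s in sentences]
    counts = Counter(w for ws in word_sents for w in ws if w not in punctuation)
    limit = max(counts.values())
    # normalize frequencies and drop overly rare or overly common words
    freqs = {w: c/limit for w, c in counts.items() if 0.1 < c/limit < 0.9}
    scores = [sum(freqs.get(w, 0) for w in ws) for ws in word_sents]
    # return the num_sentences highest-scoring sentences
    return [s for s, _ in nlargest(num_sentences, zip(sentences, scores), key=lambda t: t[1])]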
In [3]:
td = datetime.today().strftime("%Y%m%d")     # today's date, e.g. 20200701
tdd = datetime.today().strftime("%m%d")      # month and day only, e.g. 0701

# # step_000: entry point.
def zidongShengCheng():
    global path
    print('Initializing and starting; the function name is zidongShengCheng().')
    print('start path , {}'.format(path))
    ## check the working folder and create it if missing
    creatFolder_in = path
    if not os.path.isdir(creatFolder_in):
        os.mkdir(creatFolder_in)
    ## the original data file is the first file in the folder
    path_dir = path
    original_fileName = os.listdir(path_dir)[0]
    file_path = path_dir + original_fileName
    path = file_path

    # read all six sheets and concatenate them into one frame
    ddf_li = [pd.read_excel(file_path, sheet_name=x) for x in range(6)]
    ddf = pd.concat(ddf_li)

    ## start preprocessing the DataFrame
    ddf = ddf[['subMenu','title','content','yoyag']]

    # count the sentences in each Naver summary ('.' -> '._' keeps the period attached)
    sample_list = []
    ss = ddf['yoyag'].tolist()
    for i in ss:
        ts = i.replace('.','._').split('_')
        ts = list(filter(bool, ts))
        sample_list.append(ts)
    sample_listcnt = [len(x) for x in sample_list]

    ddf['yoyag_cnt'] = sample_listcnt
    # keep only articles whose Naver summary is exactly 3 sentences
    ddf = ddf[ddf['yoyag_cnt'] == 3][['subMenu','title','content','yoyag','yoyag_cnt']]
    ddf_group = ddf.sort_values(by="subMenu",ascending = False).groupby('subMenu').head(17)
    ddf_group = ddf_group.head(100)
    ddf_group= ddf_group[['subMenu','title','content','yoyag','yoyag_cnt']]
    ddf_group.to_excel('./news002/quanbu/news_100.xlsx',sheet_name='통합')

    return ddf_group
# zidongShengCheng()  # run this when preparing the input file
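
The '.' -> '._' replace-then-split trick above keeps the period attached to each sentence; a quick illustration with a made-up summary string:

# How yoyag_cnt is computed (sample text is made up).
yoyag = "첫 번째 문장입니다. 두 번째 문장입니다. 세 번째 문장입니다."
sentences = list(filter(bool, yoyag.replace('.', '._').split('_')))
print(len(sentences))   # 3 -> this row would pass the yoyag_cnt == 3 filter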
In [4]:
def df_cleaning(dfs_list):  # takes a list of DataFrames
    for x in range(len(dfs_list)):
        dd_li = []
        for i in dfs_list[x]['content']:
            dd_li.append(text_cleaning(i))
        dfs_list[x]['content'] = dd_li
    return dfs_list


##  # step_4: shared helper, text cleaning
def text_cleaning(text):
    ##################################  regexes used to insert line breaks before gensim summarization (200624)
    # each pattern matches "Hangul + '.'" followed by one character of a given class
    hangul_path0 = '[가-힣]+\.[가-힣]{1}'        # Hangul '.' + Hangul
    hangul_path1 = '[가-힣]+\.[\d]{1}'           # Hangul '.' + digit, same as [0-9]
    hangul_path2 = '[가-힣]+\.[a-z]{1}'          # Hangul '.' + lowercase letter
    hangul_path3 = '[가-힣]+\.[A-Z]{1}'          # Hangul '.' + uppercase letter
    hangul_path4 = '[가-힣]+\.[\S]{1}'           # Hangul '.' + non-whitespace, same as [^ \t\n\r\f\v]
    hangul_path5 = '[가-힣]+\.[\s]{1}'           # Hangul '.' + whitespace, same as [ \t\n\r\f\v]

    reg_path = '|'.join([hangul_path0, hangul_path1, hangul_path2,
                         hangul_path3, hangul_path4, hangul_path5])
    hangul = re.compile(reg_path)

    result = hangul.findall(text)             # all matches, returned as a list of strings
    result = list(set(result))
    for x in result:
        # insert a line break between the period and the character that follows it
        text = text.replace(x, x[:-1] + '\n' + x[-1:])

    ### strip leading and trailing whitespace on every line
    text = '\n'.join(x.strip() for x in text.split('\n'))
    ##################################  end of the line-breaking step
    text = re.sub('\[.+?\]','', text)         # delete everything inside square brackets
    
    # remove emoji
    EMOJI = re.compile('[\U00010000-\U0010ffff]', flags=re.UNICODE)
    text= EMOJI.sub(r'', text)
    # remove e-mail addresses
    email =re.compile('([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)')
    text = email.sub('', text)
    # remove URLs
    url =re.compile('(http|ftp|https)://(?:[-\w.]|(?:%[\da-fA-F]{2}))+')
    text = url.sub('', text)
    # remove HTML tags
    html =re.compile('<[^>]*>')
    text = html.sub('', text)

    # replace special characters with spaces (the period is kept so sentences survive)
    # special = re.compile('[^\w\s]') would be the regex alternative
    special= ['*', '{', ',', ':', ']', '$', '+', '[', '#', '(', '%', '&', '}', '`', '‘', '’','·',
                '=', ';', '>', '/', '"', '“', '”', '\\', '?', '~', "'", '<', ')', '^', '!', '_',
                '|', '@','©','ⓒ', '℗','®','①', '-','▶','…','☞','▲','◆','■'] # everything except '.'
    for ch in special:
        text = text.replace(ch, ' ')

    # collapse the duplicate spaces left behind
    while '  ' in text:
        text = text.replace('  ', ' ')

    # collapse duplicate newlines
    while '\n\n' in text:
        text = text.replace('\n\n', '\n')

    # strip leading and trailing whitespace
    text = text.strip()

    return text
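
A quick sanity check of what text_cleaning does to a made-up input line:

# Demonstration of text_cleaning (input is made up).
sample = "날씨가 좋습니다.내일은 비가 옵니다. [사진] 문의는 test@example.com"
print(text_cleaning(sample))
# Each "Hangul + ." boundary becomes a line break, the bracketed tag and the
# e-mail address are removed, and the listed special characters become spaces.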
In [5]:
##  # step_3
def gensim_wordSummarize(df2):
    tts = df2['summary_yoyag'].tolist()       # gensim summaries
    tts1 = df2['yoyag'].tolist()              # Naver summaries
    aa = []
    bb = []
    cc = []
    li_tts = []
    liwd_cnt = []
    for tts_cnt in range(len(tts)):
        ## flatten this row's gensim summary into a single string
        ttss = ''.join(tts[tts_cnt])
        ## flatten this row's Naver summary into a single string
        ttss1 = ''.join(tts1[tts_cnt])

        liwd = word_tokenize(ttss)                          #####  tokenize the gensim summary
        liwd = [x for x in liwd if len(x) > 2]
        liwd1 = word_tokenize(ttss1)                        #####  tokenize the Naver summary
        liwd1 = [x for x in liwd1 if len(x) > 2]

        a = Counter(liwd)   # most_common() lists words by descending frequency
        b = Counter(liwd1)
        c = a & b                                           # multiset intersection
        c = c.most_common(most_cnt)                         # top intersecting words

        if len(liwd) == 0 :
            biyur = 0
        else:
            biyur = int(round(len(c)/len(liwd), 2)*100)
        biyur = str(biyur) + '%'                 # gensim word-match ratio against the Naver summary

        ## (a | b would give the union instead)
        aa.append(a.most_common(3))   # keep the top few words per summary
        bb.append(b.most_common(3))
        liwd_cnt.append(biyur)
        cc.append(len(c))             # intersection size
        li_tts.append(c)

    df2['요약단어_교집합']   = li_tts
    df2['naver_단어빈도']    = aa
    df2['gensim_단어빈도']   = bb
    df2['단어_빈도수']       = cc
    df2['단어_일치율']       = liwd_cnt

    df3 = df2.copy()
    df3['단어매칭율'] = pd.to_numeric(df3['단어_일치율'].str.replace('%',''))
    df3['단어매칭율'] = str(round(df3['단어매칭율'].mean(),2)) + '%'
    for i in range(len(df3['요약단어_교집합'])):
        if len(df3['요약단어_교집합'][i]) == 0 :
            df3.at[i, '요약단어_교집합'] = '무'
    df3 = df3[['subMenu','title','content','yoyag_cnt','gensimyy_cnt','단어_일치율','yoyag','summary_yoyag']]
################################################################################################
    ##  optionally keep only rows whose gensim summary is exactly 3 sentences
#     df3 = df3[df3['gensimyy_cnt'] >= 3][['subMenu','title','content','yoyag_cnt','gensimyy_cnt','단어매칭율','yoyag','summary_yoyag']]
#     df3 = df3[df3['gensimyy_cnt'] < 4 ][['subMenu','title','content','yoyag_cnt','gensimyy_cnt','단어매칭율','yoyag','summary_yoyag']]
    # df3 = df3.sort_values(by=['단어_빈도수'], ascending=False)   # sort by intersection size
################################################################################################
    td = '20'+datetime.today().strftime("%m%d")     # today's date, e.g. 200701
    paths = './yoyag_test/'+td + '/'
    path_to = paths +td+'_'+ str(most_cnt)+'_gensim_word요약본.xlsx'
    df3.to_excel(path_to)
    return df3
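
The match ratio rests on Counter's multiset intersection (a & b), which keeps the minimum count of every word the two summaries share; a small illustration with made-up token lists:

# How 단어_일치율 is computed (token lists are made up).
from collections import Counter

gensim_tokens = ['대통령', '경제', '정책', '경제']
naver_tokens  = ['경제', '정책', '국회']
a, b = Counter(gensim_tokens), Counter(naver_tokens)
common = (a & b).most_common()                            # [('경제', 1), ('정책', 1)]
ratio = int(round(len(common) / len(gensim_tokens), 2) * 100)
print(common, str(ratio) + '%')                           # -> 50%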
In [6]:
##  # step_001
##################################################################################################
li_summary = []
li_summary2 = []
tts  = []
tts1 = []
chk = 0
subMenu    = ''                              # target menu
sheet_cnt  = 0                               # sheet index 0-5: 통합, 정치, 경제, 사회, 생활, 세계, IT
most_cnt   = 100                             # cap on the number of top intersecting words
# test_ratio = 0.5                           # summary ratio, if summarizing by ratio instead

creatFolder_in = ''
td = ''

def start_fn():
    zidongShengCheng()
    ############ create the output folders if they do not exist
    path = './yoyag_test/'
    if not os.path.isdir(path):
        os.mkdir(path)
    creatFolder_in = './yoyag_test/'+'20'+tdd + '/'
    if not os.path.isdir(creatFolder_in):
        os.mkdir(creatFolder_in)
    # resolve the final input path
    path_dir = './news002/quanbu/'
    path_list = os.listdir(path_dir)
    path = path_dir + path_list[1]           # assumes news_100.xlsx is the second file listed

    df = pd.read_excel(path, sheet_name=sheet_cnt)
    print(path)
    df2 = df.copy()
    df2 = df2[['subMenu','title','content','yoyag','yoyag_cnt']]
    pd.set_option('mode.chained_assignment',  None) # <==== silence the SettingWithCopy warning
    # ################################################### test001: 요약단어_교집합, gensim summary vs Naver summary
    cnt_li = df2['content'].tolist()    # process each article body
    subMenu  = df2['subMenu'][0]
    li_summary_contnet = []
    chk = 0
    for i in cnt_li:                  # clean each article, then summarize it
        chk = chk + 1
        output = text_cleaning(i)
        li_summary_contnet.append(output)             # keep the cleaned content
        # TotalWord: number of word tokens in the cleaned text
        TotalWord = word_tokenize(output)
        TotalWord_cnt = len(TotalWord)
        # TotalLine: number of lines after period-based splitting (see text_cleaning)
        TotalLine = output.replace('. ','. _').split('_')
        TotalLine_cnt = len(TotalLine)
        # target word count = average words per sentence * 3.2, aiming at roughly 3 sentences
        St_wordCnt = (TotalWord_cnt/TotalLine_cnt) * 3.2
        St_wordCnt = int(St_wordCnt)
        # output = summarize(output, ratio=test_ratio)           ## ratio-based alternative
        output = summarize(output, word_count = St_wordCnt)      ## summarize the cleaned content
        li_summary.append(output)
    df2['summary_yoyag'] = pd.Series(li_summary)
    df2['content'] = pd.Series(li_summary_contnet)
    for i in df2['yoyag']:
        output = text_cleaning(i)
        li_summary2.append(output)
    df2['yoyag'] = pd.Series(li_summary2)
    # count the sentences in each gensim summary
    sample_list = []
    ss = df2['summary_yoyag'].tolist()
    for i in ss:
        ts = i.replace('.','._').split('_')
        ts = list(filter(bool, ts))
        sample_list.append(ts)

    sample_listcnt = [len(x) for x in sample_list]
    df2['gensimyy_cnt'] = sample_listcnt

    # run the gensim-vs-Naver comparison
    df = gensim_wordSummarize(df2)
    return df
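
The word_count heuristic above asks gensim for roughly 3.2 sentences' worth of words; worked through with made-up counts:

# Worked example of the word_count heuristic (numbers are made up).
TotalWord_cnt = 320    # word tokens in a cleaned article
TotalLine_cnt = 16     # lines after period-based splitting
St_wordCnt = int((TotalWord_cnt / TotalLine_cnt) * 3.2)
print(St_wordCnt)      # 64 -> about 3 sentences of ~20 words each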
In [7]:
# folder that holds the input file
path =  './news002/quanbu/'

# run the pipeline
start_fn()

# open the output folder automatically when done
start_directory = r'D:\app_src\anaconda\04-srcTest\yoyag_test\200701'
os.startfile(start_directory)
####################################################################### test002: 요약단어_교집합, frequency summary vs Naver summary
# gensim_wordSummarize(df2)
##  # step_2-1
# txdf      = textSummerFunc()                       # use the frequency-based summarizer instead
# tts0      = txdf['summarize_word'].tolist()
# tts01     = txdf['yoyag'].tolist()
# subMenu   = txdf['subMenu'][0]
# summary_wordSummarize(tts0,tts01)
#######################################################################
    
Initializing and starting; the function name is zidongShengCheng().
start path , ./news002/quanbu/
./news002/quanbu/news_100.xlsx