# 무회blog
#
# python: 200616-파이썬, writeFile_NewsContent-004.002-Copy1 본문
#
# Python
#
# python: 200616-파이썬, writeFile_NewsContent-004.002-Copy1
#
# 최무회 2020. 6. 16. 19:51
# 200616-파이썬, writeFile_NewsContent-004.002
# parameter 변경 후 테스트 
# No parts
# ToDayURL   ->    https://news.naver.com/main/list.nhn?mode=LSD&mid=sec&sid1=001&date=20200612&page=1

import time ,re , timeit
import pandas as pd
import requests as rq
import selenium as se
from selenium import webdriver
from bs4 import BeautifulSoup
from bs4 import SoupStrainer
from selenium import webdriver
from datetime import datetime
# Start Chrome through the local chromedriver binary.
# Raw string: the Windows path is full of backslashes, and sequences such as
# "\P" or "\G" are invalid escapes in a normal literal (they only work by
# accident and trigger a warning). The resulting path value is unchanged.
driver = webdriver.Chrome(r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe")

# CSS selector of the "summary bot" button on an article page.
slc_yoyag = '#main_content > div.article_header > div.article_info > div > div.article_btns > div.article_btns_right > div.media_end_head_autosummary._auto_summary_wrapper > a'
# CSS selector of the summary text shown after the button has been clicked.
slc_yoyag_text = '#main_content > div.article_header > div.article_info > div > div.article_btns > div.article_btns_right > div.media_end_head_autosummary._auto_summary_wrapper > div > div.media_end_head_autosummary_layer_body > div._contents_body'

# Link tables: the default sheet and 'Sheet2' of the same workbook.
df_links = pd.read_excel('./naversogbo_links.xlsx')
read_sheet2 = pd.read_excel('./naversogbo_links.xlsx', sheet_name='Sheet2')

pd_links = read_sheet2['links']        # base list URL of every section
pg_links = read_sheet2['page_link']    # per-section query suffix (date / page part)
# td_links = df_links['today_links']   # all of today's breaking-news links (unused)

pd_link = pd_links.tolist()
pg_link = pg_links.tolist()
# td_link = td_links.tolist()

lis = []

class testNews:
    """Scrape Naver breaking-news sections and store them as Excel workbooks.

    The class is used purely as a namespace: every function is called as
    ``testNews.<name>()`` and communicates through module-level globals
    (``tg_cnt``, ``nanugi``, ``nanugi_bijiao``, ``rg_cnt``, ``tg_date``,
    ``paging``, ``td``) plus the shared Selenium ``driver``.

    Section indexes (``tg_cnt``):
        0 all            1 politics       2 economy      3 society
        4 life/culture   5 world          6 IT/science   7 opinion
    """

    @staticmethod
    def _article_links(page_body):
        """Return the unique article URLs of one list page, in page order."""
        selectors = (
            '#main_content > div.list_body.newsflash_body > ul.type06_headline',
            '#main_content > div.list_body.newsflash_body > ul.type06',
        )
        hrefs = []
        for selector in selectors:
            for ul in page_body.select(selector):
                for anchor in ul.find_all('a'):
                    href = anchor.get('href')
                    if href:  # anchors without href cannot be opened
                        hrefs.append(href)
        # dict.fromkeys de-duplicates while keeping a deterministic order.
        return list(dict.fromkeys(hrefs))

    @staticmethod
    def excuteFunc():
        """Scrape the section selected by the global ``tg_cnt`` into one workbook.

        Reads the globals ``tg_cnt``, ``nanugi`` (requested number of list
        pages), ``nanugi_bijiao`` (threshold: when ``nanugi`` is not below it,
        every available page is crawled), ``tg_date``, ``paging`` and ``td``.
        Only articles whose summary-bot button is present and displayed are
        kept. The result is written to
        ``./news001/news01/`` (``nanugi < 499``) or ``./news001/news02/`` as
        ``<td>_<nanugi>_<tg_cnt>.xlsx``.

        The global ``nanugi`` is deliberately NOT modified here: the merge
        functions rebuild the file names from ``nanugi``, so the name must not
        depend on how many pages a particular section happened to have.

        Raises:
            IndexError: if an expected element (section title, paging
                indicator, press logo, article title/body) is missing.
        """
        print('분야별 tg_cnt,{}'.format(tg_cnt))
        print(nanugi_bijiao)

        ## Section title
        driver.get(pd_link[tg_cnt] + pg_link[tg_cnt])
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        if tg_cnt == 0:
            raw_title = '전체' + soup.select('#snb > h2 > a')[0].get_text()
            pdg_src_title = re.sub("[-=.#/?:$}]", '_', raw_title)
        else:
            raw_title = soup.select('#snb > ul > li.on > a')[0].get_text()
            # The last three characters are cut off
            # (presumably a trailing " 속보" suffix -- TODO confirm).
            pdg_src_title = re.sub("[-=.#/?:$}]", '_', raw_title)[:-3]

        print('pdg_src_title,{}'.format(pdg_src_title))

        ## Build the list URL: <base><first key>=<tg_date><second key>=<page>
        tg_link = pg_link[tg_cnt]            # e.g. "&date=20200613&page=1"
        date_part = tg_link[:tg_link.find('=') + 1] + tg_date
        remainder = tg_link[len(date_part):]
        page_key = remainder[:remainder.find('=') + 1]
        qh_link = pd_link[tg_cnt] + date_part + page_key   # page number is appended later

        # Open the list with the probe page number `paging`; the highlighted
        # entry of the pager is then read as the total number of pages.
        driver.get(qh_link + paging)
        pager = BeautifulSoup(driver.page_source, 'html.parser')
        pager = pager.select('#main_content > div.paging > strong')
        totalPg_cnt = int(pager[0].get_text())

        print('totalPg_cnt, {}'.format(totalPg_cnt))

        ## How many pages to crawl
        if nanugi < nanugi_bijiao:
            # Limited run: the requested amount, capped by what exists.
            totalPg_cnt_nanugi = min(nanugi, totalPg_cnt)
        else:
            # Full run: every available page.
            totalPg_cnt_nanugi = totalPg_cnt

        print('totalPg_cnt_nanugi, {}'.format(totalPg_cnt_nanugi))
        sub_menus = []
        presses = []
        titles = []
        contents = []
        summaries = []
        frmNews = {}
        print('초기화용 frmNews,{}'.format(len(frmNews)))
        cked = 0

        ## Walk the list pages and visit every article
        for j in range(totalPg_cnt_nanugi):
            driver.get(qh_link + str(j + 1))
            page_body = BeautifulSoup(driver.page_source, 'html.parser').body

            for u3 in testNews._article_links(page_body):
                driver.get(u3)
                click_yoyag = driver.find_elements_by_css_selector(slc_yoyag)
                if not (click_yoyag and click_yoyag[0].is_displayed()):
                    continue  # no summary bot for this article

                article = BeautifulSoup(driver.page_source, 'html.parser').body
                _logo = article.find_all(class_="press_logo")[0].find('img').get('alt')
                _title = article.select('#articleTitle')[0].get_text()
                _content = article.select('#articleBodyContents')[0].get_text().strip()

                # Open the summary layer and read its text.
                click_yoyag[0].click()
                time.sleep(1)
                layer = BeautifulSoup(driver.page_source, 'html.parser')
                summary_nodes = layer.body.select(slc_yoyag_text)
                # Exactly one summary value per article (empty when the layer
                # did not render), so all columns keep the same length and the
                # DataFrame construction below cannot fail on ragged lists.
                _press_yoyag = '\n'.join(node.get_text() for node in summary_nodes)

                sub_menus.append(pdg_src_title)
                presses.append(_logo)
                titles.append(_title)
                contents.append(_content)
                summaries.append(_press_yoyag)

                cked = cked + 1
                print(" j= {}, cked= {}".format(j, cked), end=',')

        frmNews = {'subMenu': sub_menus, 'newsFrom': presses, 'title': titles,
                   'content': contents, 'yoyag': summaries}
        print('-'*50)
        print('리스트용 frmNews,{}'.format(len(frmNews)))
        df = pd.DataFrame(frmNews)
        # Strip the inline flash-workaround script text that precedes the body.
        # NOTE(review): the result of the second replace appears to depend on
        # the pandas default for `regex`; the [3:] slice below seems to absorb
        # the leftover either way -- confirm before touching these lines.
        df['content']=df['content'].str.replace("// flash 오류를 우회하기 위한 함수 추가\n","").str.replace("function _flash_removeCallback()","")
        df['content']=df['content'].str[3:].str.replace("\n","").str.replace("\t","").str.replace("{}","").str.strip()
        excel_nm = td+'_'+str(nanugi)+'_'+str(tg_cnt)+ '.xlsx'
        if nanugi < 499:
            df.to_excel('./news001/news01/'+excel_nm)
        else:
            df.to_excel('./news001/news02/'+excel_nm)

    ############################################################################
    ## Partial run
    @staticmethod
    def start_switch():
        """Scrape sections ``0 .. rg_cnt-1`` with the current ``nanugi`` and merge them."""
        global tg_cnt
        for z in range(rg_cnt):
            tg_cnt = z            # section index
            print('일부분 추출 tg_cnt, {}'.format(tg_cnt))
            print('start분야별 ,{}'.format(tg_cnt))
            testNews.excuteFunc()
        second_run()

    ############################################################################
    ## Full run
    @staticmethod
    def start_switch2():
        """Scrape every page of sections ``0 .. rg_cnt-1`` and merge them.

        All three names are declared global: without the declaration the
        assignments to ``nanugi`` and ``tg_cnt`` would only create locals and
        ``excuteFunc`` would keep seeing the previous section and page limit.
        """
        global nanugi_bijiao, nanugi, tg_cnt
        for z in range(rg_cnt):
            nanugi_bijiao = 500
            nanugi = 500          # not below nanugi_bijiao -> crawl all pages
            tg_cnt = z
            testNews.excuteFunc()
        second_run1()

    ############################################################################
    ## Single run
    @staticmethod
    def start_switch1():
        """Scrape one page of the currently selected section and re-export it."""
        global nanugi  # the assignment must reach excuteFunc / select_First
        nanugi = 1
        testNews.excuteFunc()
        select_First()
 
############################################################################
# Print a visual separator line to the console.
print("#"*50)
# Pause for two seconds before the rest of the module is executed.
time.sleep(2)
# Module-level list of section names; select_First() appends to it and uses
# its first entry as the sheet name.
sheet_reNm = []
###########################################################################
## 한번 돌릴때 , 지정된 분야 , 
def select_First():
    """Re-export the single-section workbook to ``./news002/``.

    Reads ``./news001/news01/<td>_<nanugi>_<tg_cnt>.xlsx`` (the name is built
    from the module globals), takes the section name from the first row of
    its ``subMenu`` column, and writes the data to ``./news002/`` under the
    same file name with the section name as the sheet name. The section name
    is also appended to the module-level ``sheet_reNm`` list.
    """
    path = './news001/news01/'
    excel_nm = td+'_'+str(nanugi)+'_'+str(tg_cnt)+ '.xlsx'
    print(type(excel_nm))
    df00 = pd.read_excel(path + excel_nm)

    # Use the name read from THIS file; indexing sheet_reNm[0] would return
    # the name of an earlier call once the list holds more than one entry.
    section_name = df00['subMenu'][0]
    sheet_reNm.append(section_name)
    print('sheet_reNm,{}'.format(sheet_reNm))
    excel_wrNm = './news002/' + str(excel_nm)
    print('excel_wrNm,{}'.format(excel_wrNm))

    df00['subMenu'] = section_name
    # The context manager writes and closes the workbook exactly once
    # (calling save() and then close() closes it twice).
    with pd.ExcelWriter(excel_wrNm, engine='xlsxwriter') as writer:
        df00.to_excel(writer, sheet_name=section_name)
    print("success"+excel_wrNm)
    # Report the actual finishing time rather than the stored start time.
    print('end_now, {} 초'.format(datetime.today().strftime("%Y%m%d_%H:%M:%S")))
    
##################################################
#     ## 일부분 돌릴때 , 성공    
def second_run():
    """Merge the per-section workbooks of a partial run into one file.

    Reads ``./news001/news01/<td>_<nanugi>_<i>.xlsx`` for every
    ``i in range(rg_cnt)`` and writes them as separate sheets of
    ``./news002/yibufen/<td>_yibufen_nv.xlsx``; each sheet is named after the
    first ``subMenu`` value of its source file.
    """
    path = './news001/news01/'
    ex_nm = [path + td+'_'+str(nanugi)+'_'+str(i)+ '.xlsx' for i in range(rg_cnt)]   # source files
    dfs = [pd.read_excel(x) for x in ex_nm]
    sheet_reNms = [frame['subMenu'][0] for frame in dfs]                             # sheet names
    excel_reNm = './news002/yibufen/'+td +'_yibufen_nv.xlsx'                         # target file

    # The context manager writes and closes the workbook exactly once
    # (calling save() and then close() closes it twice).
    with pd.ExcelWriter(excel_reNm, engine='xlsxwriter') as writer:
        for sheet_name, frame in zip(sheet_reNms, dfs):
            frame.to_excel(writer, sheet_name=sheet_name)
    print("success_writer")
    # Report the actual finishing time rather than the stored start time.
    print('end_now, {} 초'.format(datetime.today().strftime("%Y%m%d_%H:%M:%S")))
    
##################################################
#     ##  전체 돌리기
def second_run1():
    """Merge the per-section workbooks of a full run into one file.

    Reads ``./news001/news02/<td>_<nanugi>_<i>.xlsx`` for every
    ``i in range(rg_cnt)`` and writes them as separate sheets of
    ``./news002/quanbu/<td>_quanbu_nv.xlsx``; each sheet is named after the
    first ``subMenu`` value of its source file.
    """
    path = './news001/news02/'
    ex_nm = [path + td+'_'+str(nanugi)+'_'+str(i)+ '.xlsx' for i in range(rg_cnt)]   # source files
    dfs = [pd.read_excel(x) for x in ex_nm]
    sheet_reNms = [frame['subMenu'][0] for frame in dfs]                             # sheet names
    excel_reNm = './news002/quanbu/'+td +'_quanbu_nv.xlsx'                           # target file

    # The context manager writes and closes the workbook exactly once
    # (calling save() and then close() closes it twice).
    with pd.ExcelWriter(excel_reNm, engine='xlsxwriter') as writer:
        for sheet_name, frame in zip(sheet_reNms, dfs):
            frame.to_excel(writer, sheet_name=sheet_name)
    print("success_writer")
    # Report the actual finishing time rather than the stored start time.
    print('end_now, {} 초'.format(datetime.today().strftime("%Y%m%d_%H:%M:%S")))
## 디폴트값 
# Default values shared (as module globals) by testNews and the merge functions.
td = datetime.today().strftime("%Y%m%d")
tg_date       =  td                  # date used in the list URL
paging        = '1001'               # probe page number used to read the page count
tg_cnt        =  1                   # section index
nanugi        =  2                   # number of pages to extract (2 or more)
nanugi_bijiao =  100                 # threshold nanugi is compared against when branching
rg_cnt        =  8                   # number of sections the for-loops iterate over
############################################################################
# Start of the timed run: a timer value plus a formatted start timestamp.
start = timeit.default_timer()   
start_now = datetime.today().strftime("%Y%m%d_%H:%M:%S")
print('start_now, {} 초'.format(start_now))

############################################################################

testNews.start_switch()     # partial run
time.sleep(10)
testNews.start_switch2()     # full run

# testNews.start_switch1()   # single run
############################################################################


# Elapsed time, printed in whole seconds and in whole minutes.
end = timeit.default_timer()    
endTime= int(end-start)
print('endTime, {} 초'.format(endTime))
endTimeMin = int(endTime/60)
print('endTimeMin, {} 분'.format(endTimeMin))
# Comments