250x250
Notice
Recent Posts
Recent Comments
일 | 월 | 화 | 수 | 목 | 금 | 토 |
---|---|---|---|---|---|---|
 | | | | | 1 | 2 |
3 | 4 | 5 | 6 | 7 | 8 | 9 |
10 | 11 | 12 | 13 | 14 | 15 | 16 |
17 | 18 | 19 | 20 | 21 | 22 | 23 |
24 | 25 | 26 | 27 | 28 | 29 | 30 |
Tags
- test
- spring MVC(모델2)방식
- Websocket
- jsp 파일 설정
- 게시판 만들기
- oracle
- 코사인 유사도
- word2vec
- tomoto
- 크롤링
- 네이버뉴스
- db
- 지마켓
- 과학백과사전
- r
- 자바
- Python
- 幼稚园杀手(유치원킬러)
- Gmarket
- (깃)git bash
- 이력서
- java
- 방식으로 텍스트
- RESFUL
- Topics
- pytorch
- 토픽추출
- lda
- 파이썬
- mysql
Archives
- Today
- Total
무회blog
200624-파이썬_004.010_headless_결측치제거_ddf_yoyag_cnt추가_news 본문
In [1]:
# 200624-파이썬_004.010_headless_결측치제거
# parameter 변경 후 테스트
# No parts
# ToDayURL -> https://news.naver.com/main/list.nhn?mode=LSD&mid=sec&sid1=001&date=20200612&page=1
# Multiprocessing으로 병렬 크롤링하기
from newspaper import Article
from multiprocessing import Pool
import time ,re , timeit ,os
import pandas as pd
import requests as rq
import selenium as se
from selenium import webdriver
from bs4 import BeautifulSoup
from bs4 import SoupStrainer
from selenium import webdriver
from datetime import datetime
# ---------------------------------------------------------------------------
# Run setup: time stamps, headless Chrome driver and working folders.
# ---------------------------------------------------------------------------
# Take the clock once so that td / tdd / tdnow always describe the same instant
# (three separate calls could straddle midnight).
now = datetime.now()
td = now.strftime("%Y%m%d")            # date used in the list-page query string
tdd = now.strftime("%m%d")             # month+day used in folder names
tdnow = now.strftime('%Y%m%d%H%M%S')   # per-run prefix for output file names

options = webdriver.ChromeOptions()
options.add_argument('headless')
options.add_argument('window-size=1920x1080')
options.add_argument("disable-gpu")
# or: options.add_argument("--disable-gpu")
# Raw string: the Windows path contains backslashes that must not be read as escapes.
driver = webdriver.Chrome(r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe", options = options)

############ Create the working folders (and remember whether they already existed)
# NOTE(review): the '20' prefix is a literal, so the folder name only looks like
# yymmdd for the year 2020 - confirm whether "%y%m%d" was intended.
creatFolder_in = './news001/'+'20'+tdd + '/'
creatFolder_out00 = './news002/yibufen/'+'20'+tdd + '/'
creatFolder_out01 = './news002/quanbu/'+'20'+tdd + '/'
checkFolder = os.path.isdir(creatFolder_in)
checkFolder00 = os.path.isdir(creatFolder_out00)
checkFolder01 = os.path.isdir(creatFolder_out01)
# makedirs also creates missing parents (./news002, ./news002/quanbu, ...) and
# exist_ok avoids the check-then-create race of isdir() + mkdir().
for _run_folder in (creatFolder_in, creatFolder_out00, creatFolder_out01):
    os.makedirs(_run_folder, exist_ok=True)
In [2]:
# CSS selector of the "summary bot" link on an article page.
slc_yoyag = '#main_content > div.article_header > div.article_info > div > div.article_btns > div.article_btns_right > div.media_end_head_autosummary._auto_summary_wrapper > a'
# CSS selector of the summary text inside the layer opened by that link.
slc_yoyag_text = '#main_content > div.article_header > div.article_info > div > div.article_btns > div.article_btns_right > div.media_end_head_autosummary._auto_summary_wrapper > div > div.media_end_head_autosummary_layer_body > div._contents_body'

# sid1 values of the list pages, in section-index order:
# 0 all, 1 politics, 2 economy, 3 society, 4 life/culture, 5 world,
# 6 IT/science, 7 opinion.
_SECTION_IDS = ('001', '100', '101', '102', '103', '104', '105', '110')

# One list-page URL per section.
pd_link = [
    'https://news.naver.com/main/list.nhn?mode=LSD&mid=sec&sid1=' + sid
    for sid in _SECTION_IDS
]
# Matching "date + first page" query suffix for every section.
pg_link = ['&date=' + td + '&page=1' for _ in _SECTION_IDS]

lis = []
class testNews:
    """Crawl Naver news-flash lists and save articles that offer an auto-summary.

    Every method is static and is called as ``testNews.<name>()``.  The methods
    work on module-level state defined elsewhere in this file: the Selenium
    ``driver``, the URL tables ``pd_link`` / ``pg_link``, the selectors
    ``slc_yoyag`` / ``slc_yoyag_text``, the folder ``creatFolder_in``, the run
    stamp ``tdnow`` and the run parameters ``tg_cnt``, ``tg_date``, ``paging``,
    ``nanugi_bijiao`` and ``rg_cnt``.
    """

    ############################################################################
    ## Section indexes
    # 0 all         # 1 politics   # 2 economy     # 3 society
    # 4 life/culture # 5 world     # 6 IT/science  # 7 opinion
    ############################################################################

    @staticmethod
    def _build_list_url(section_url, date, page=''):
        """Return *section_url* with the ``&date=`` and ``&page=`` parts appended.

        With the default empty *page* the result ends in ``&page=`` so that a
        page number can be appended by the caller.
        """
        return '{}&date={}&page={}'.format(section_url, date, page)

    @staticmethod
    def _clean_content(content):
        """Remove the flash-workaround script text and layout whitespace.

        content: pandas Series holding the raw article body strings.
        Returns a new Series; the input is not modified.
        """
        # regex=False everywhere: these are literal snippets.  Read as a regular
        # expression, "function _flash_removeCallback()" would leave the "()"
        # behind, and the default of ``regex`` differs between pandas versions.
        for boilerplate in ("// flash 오류를 우회하기 위한 함수 추가\n",
                            "function _flash_removeCallback()"):
            content = content.str.replace(boilerplate, "", regex=False)
        content = content.str[3:]  # the first three characters are dropped unconditionally
        for junk in ("\n", "\t", "{}"):
            content = content.str.replace(junk, "", regex=False)
        return content.str.strip()

    @staticmethod
    def _section_title():
        """Open the first list page of section ``tg_cnt`` and return its menu title."""
        driver.get(pd_link[tg_cnt] + pg_link[tg_cnt])
        page = BeautifulSoup(driver.page_source, 'html.parser')
        if tg_cnt == 0:
            raw = '전체' + page.select('#snb > h2 > a')[0].get_text()
            return re.sub(r"[-=.#/?:$}]", '_', raw)
        raw = page.select('#snb > ul > li.on > a')[0].get_text()
        # The selected menu entry carries three trailing characters that are cut off.
        return re.sub(r"[-=.#/?:$}]", '_', raw)[:-3]

    @staticmethod
    def _article_links(list_url):
        """Return the distinct article URLs found on one list page, in page order."""
        driver.get(list_url)
        body = BeautifulSoup(driver.page_source, 'html.parser').body
        hrefs = []
        for selector in ('#main_content > div.list_body.newsflash_body > ul.type06_headline',
                         '#main_content > div.list_body.newsflash_body > ul.type06'):
            for block in body.select(selector):
                hrefs.extend(a.get('href') for a in block.find_all('a'))
        # dict.fromkeys de-duplicates while keeping a deterministic order;
        # anchors without an href are skipped because they cannot be opened.
        return [href for href in dict.fromkeys(hrefs) if href]

    @staticmethod
    def _scrape_article(url):
        """Open *url* and return ``(press, title, content, summary)``.

        Returns ``None`` when the page shows no visible summary link.
        ``summary`` is ``None`` when the summary layer holds no text node, so
        every article contributes exactly one value per column.
        """
        driver.get(url)
        body = BeautifulSoup(driver.page_source, 'html.parser').body
        # The ("css selector", value) form is accepted by Selenium 3 and 4 alike;
        # find_elements_by_css_selector no longer exists in current Selenium.
        buttons = driver.find_elements('css selector', slc_yoyag)
        if not buttons or not buttons[0].is_displayed():
            return None
        press = body(class_="press_logo")[0].find('img').get('alt')
        title = body.select('#articleTitle')[0].get_text()
        content = body.select('#articleBodyContents')[0].get_text().strip()
        buttons[0].click()
        time.sleep(1)  # wait for the summary layer before re-reading the page
        opened = BeautifulSoup(driver.page_source, 'html.parser')
        nodes = opened.body.select(slc_yoyag_text)
        summary = ' '.join(node.get_text() for node in nodes) if nodes else None
        return press, title, content, summary

    @staticmethod
    def excuteFunc():
        """Crawl section ``tg_cnt`` and write the collected articles to Excel.

        Side effects: drives the shared browser, sets the global ``nanugi`` to
        the number of list pages, and writes
        ``creatFolder_in + tdnow_<tg_cnt>_<nanugi>.xlsx``.
        Crawling stops early once ``nanugi_bijiao`` articles are collected,
        unless ``nanugi_bijiao`` is larger than 500 (no limit).
        """
        global nanugi  # number of list pages; read again by select_First()

        ## Section title
        pdg_src_title = testNews._section_title()
        print('pdg_src_title, 분야 ---> {}'.format(pdg_src_title))

        ### Total page count: open page number `paging` and read the highlighted
        ### pager entry.  NOTE(review): this presumably relies on the site
        ### showing its last page for an out-of-range number - confirm.
        section_url = pd_link[tg_cnt]
        driver.get(testNews._build_list_url(section_url, tg_date, paging))
        pager = BeautifulSoup(driver.page_source, 'html.parser')
        totalPg_cnt = int(pager.select('#main_content > div.paging > strong')[0].get_text())
        nanugi = totalPg_cnt

        rows = []  # one (subMenu, newsFrom, title, content, yoyag) tuple per article
        print('초기화용 frmNews,{} nanugi_bijiao,{}'.format(len(rows), nanugi_bijiao))
        print('nanugi,{} totalPg_cnt, {} totalPg_cnt_nanugi, {}'.format(nanugi, totalPg_cnt , totalPg_cnt))

        ### Walk the list pages and collect the articles
        limit_reached = False
        for j in range(totalPg_cnt):
            list_url = testNews._build_list_url(section_url, tg_date, j + 1)
            for article_url in testNews._article_links(list_url):
                row = testNews._scrape_article(article_url)
                if row is None:
                    continue
                rows.append((pdg_src_title,) + row)
                print(" j {} cked={}".format(j, len(rows)) , end= ',')
                # nanugi_bijiao > 500 means "crawl everything"
                if nanugi_bijiao <= 500 and len(rows) >= nanugi_bijiao:
                    limit_reached = True
                    break
            if limit_reached:
                break

        df = pd.DataFrame(rows, columns=['subMenu', 'newsFrom', 'title', 'content', 'yoyag'])
        df['content'] = testNews._clean_content(df['content'])
        excel_nm = tdnow+'_'+str(tg_cnt)+'_'+str(nanugi)+ '.xlsx'
        df.to_excel(creatFolder_in +excel_nm)
        print('리스트용 frmNews,{}'.format(len(df.columns)))
        print('-'*50)

    ############################################################################
    ## Run a subset of the sections
    @staticmethod
    def start_switch():
        """Crawl sections 1 .. rg_cnt-1, then merge the results with second_run()."""
        global tg_cnt
        for z in range(1, rg_cnt):  # section 0 ("all") is skipped
            tg_cnt = z
            print('start_switch tg_cnt, {}'.format(tg_cnt))
            testNews.excuteFunc()
        second_run()

    ############################################################################
    ## Run all sections
    @staticmethod
    def start_all():
        """Crawl sections 1 .. rg_cnt-1, then merge and clean with second_all()."""
        global tg_cnt
        for z in range(1, rg_cnt):  # section 0 ("all") is skipped
            tg_cnt = z
            print('start_all tg_cnt, {}'.format(tg_cnt))
            testNews.excuteFunc()
        # Must run after the loop: the clean-up step reads one sheet per section.
        second_all()

    ############################################################################
    ## Run a single section
    @staticmethod
    def start_switch1():
        """Crawl only the section currently stored in ``tg_cnt``."""
        print('start_switch1 ')
        testNews.excuteFunc()
        select_First()
In [3]:
############################################################################
# Visual separator in the cell output, followed by a short pause.
print("#" * 50)
time.sleep(2)
# Folder the merge helpers read from, and the shared list of sheet names.
path = creatFolder_in
sheet_reNm = list()
###########################################################################
## 결측치 값 행 빼기
def count_yoyag_sentences(text):
    """Return the number of non-empty pieces of *text* after splitting behind each '.'.

    An underscore is used as the temporary split marker, so an underscore in
    *text* also separates pieces.  Non-string input is converted with str().
    """
    pieces = str(text).replace('.', '._').split('_')
    return len([piece for piece in pieces if piece])


def drop_naValue():
    """Clean the newest merged workbook and export the three-sentence summaries.

    Steps:
      1. Pick the workbook in ``creatFolder_out01`` whose name sorts last.
      2. Read the first ``rg_cnt - 1`` sheets; per sheet drop duplicate
         ``content`` rows and rows with missing values, keep at most 100 rows.
      3. Write them to ``<tdnow>_Replace_quanbu_nv.xlsx``, one sheet per section.
      4. Re-read that file, add ``yoyag_cnt`` and keep rows where it equals 3;
         write the result to ``./news002/quanbu/news_100.xlsx``.
      5. Open the output folder in the file manager where the OS supports it.

    Raises:
        FileNotFoundError: no workbook is present in ``creatFolder_out01``.
        ValueError: no row survives the clean-up of step 2.
    """
    # Only real workbooks: skip Excel lock files ("~$...") and other entries.
    workbooks = sorted(
        (name for name in os.listdir(creatFolder_out01)
         if name.endswith('.xlsx') and not name.startswith('~$')),
        reverse=True,  # descending, newest time-stamp prefix first
    )
    if not workbooks:
        raise FileNotFoundError('no .xlsx workbook found in {}'.format(creatFolder_out01))
    path2 = creatFolder_out01 + workbooks[0]
    print(path2)
    print('-'*50)

    # Read each section sheet and drop duplicates / rows with missing values.
    dfs_list = []
    for x in range(rg_cnt - 1):
        frame = pd.read_excel(path2, sheet_name=x).drop_duplicates('content').dropna(axis=0).head(100)
        if frame.empty:
            # Nothing to write and no section name to read from this sheet.
            print('sheet {} has no complete rows, skipped'.format(x))
            continue
        dfs_list.append(frame)
    if not dfs_list:
        raise ValueError('no rows left in {} after removing duplicates and missing values'.format(path2))

    excel_reNm = os.path.join(creatFolder_out01, tdnow + '_Replace' + '_quanbu_nv.xlsx')
    # The context manager saves and closes the file; ExcelWriter.save() no
    # longer exists in current pandas.
    with pd.ExcelWriter(excel_reNm, engine='xlsxwriter') as writer:
        for frame in dfs_list:
            # .iloc[0]: positional access - label 0 may have been dropped above.
            sheet = frame['subMenu'].iloc[0]
            frame[['subMenu', 'newsFrom', 'title', 'content', 'yoyag']].to_excel(writer, sheet_name=sheet)
    print("success_writer_Replace")
    ####################################################################################################
    time.sleep(2)
    # Read back exactly the sheets written above instead of a hard-coded count.
    ddf = pd.concat([pd.read_excel(excel_reNm, sheet_name=x) for x in range(len(dfs_list))])
    ddf = ddf[['subMenu', 'title', 'content', 'yoyag']]
    ddf = ddf.assign(yoyag_cnt=[count_yoyag_sentences(text) for text in ddf['yoyag']])
    # ">= 3 and < 4" on an integer count is "== 3".
    ddf = ddf[ddf['yoyag_cnt'] == 3]
    ddf.to_excel('./news002/quanbu/news_100.xlsx', sheet_name='통합')

    folder = os.path.realpath(creatFolder_out01)
    if hasattr(os, 'startfile'):  # os.startfile is only available on Windows
        os.startfile(folder)
###########################################################################
## 한번 돌릴때 , 지정된 분야 ,
def select_First():
    """Copy the single-section crawl result into ``./news002/`` as one named sheet.

    Reads the workbook for the current ``tdnow`` / ``tg_cnt`` / ``nanugi`` from
    the module-level ``path`` folder, records the section name in the
    module-level ``sheet_reNm`` list and writes the frame to
    ``./news002/<tdnow>_<tg_cnt>_<nanugi>_select_First.xlsx``.
    """
    base_nm = tdnow+'_'+str(tg_cnt)+'_'+str(nanugi)
    excel_nm = base_nm+'_select_First' + '.xlsx'
    print(type(excel_nm))
    # The crawl step saves "<base>.xlsx" (no "_select_First" suffix), so the
    # suffixed name normally does not exist in the input folder.  Prefer it if
    # it is there, otherwise read the file the crawl actually produced.
    source = path + excel_nm
    if not os.path.exists(source):
        source = path + base_nm + '.xlsx'
    df00 = pd.read_excel(source)
    # Positional access; use this run's section name rather than whatever was
    # appended first to sheet_reNm by an earlier call.
    section = df00['subMenu'].iloc[0]
    sheet_reNm.append(section)
    excel_wrNm = './news002/'+str(excel_nm)
    print('sheet_reNm,{}'.format(sheet_reNm))
    print('excel_wrNm,{}'.format(excel_wrNm))
    df00['subMenu'] = section
    # The context manager saves and closes the file; ExcelWriter.save() no
    # longer exists in current pandas.
    with pd.ExcelWriter(excel_wrNm, engine = 'xlsxwriter') as writer:
        df00.to_excel(writer, sheet_name = section)
    print("success_writer")
##################################################
# ## 일부분 돌릴때 , 성공
def _merge_section_workbooks(excel_reNm):
    """Combine every workbook in the module-level ``path`` folder into *excel_reNm*.

    Each input file becomes one sheet, named after the first ``subMenu`` value
    of that file.  Files are processed in sorted name order.
    """
    # Only real workbooks: skip Excel lock files ("~$...") and other entries.
    ex_nm = sorted(name for name in os.listdir(path)
                   if name.endswith('.xlsx') and not name.startswith('~$'))
    # Read everything before opening the writer, so a read error does not leave
    # a half-written output file behind.
    dfs = []
    for name in ex_nm:
        frame = pd.read_excel(path + name)
        if frame.empty:
            # No row to take the sheet name from.
            print('{} has no rows, skipped'.format(name))
            continue
        dfs.append(frame)
    # The context manager saves and closes the file; ExcelWriter.save() no
    # longer exists in current pandas.
    with pd.ExcelWriter(excel_reNm, engine = 'xlsxwriter') as writer:
        for frame in dfs:
            frame.to_excel(writer, sheet_name = frame['subMenu'].iloc[0])
    print("success_writer")


# ## Partial run
def second_run():
    """Merge the crawled section files into ``./news002/yibufen/<tdnow>_yibufen_nv.xlsx``."""
    _merge_section_workbooks('./news002/yibufen/'+tdnow +'_yibufen_nv.xlsx')


##################################################
# ## Full run
def second_all():
    """Merge the crawled section files into the ``quanbu`` folder, then clean them.

    Writes ``<creatFolder_out01>/<tdnow>_MeiQingKong_quanbu.xlsx`` and afterwards
    calls ``drop_naValue()``.
    """
    _merge_section_workbooks(os.path.join(creatFolder_out01, tdnow +'_MeiQingKong_quanbu.xlsx'))
    time.sleep(2)
    drop_naValue()
# second_all()
In [ ]:
## Default run parameters
# td = datetime.today().strftime("%Y%m%d")
# tdnow = now.strftime('%Y%m%d%H%M%S')
tg_date = td            # date to crawl
paging = '601'          # page number requested when reading the total page count
tg_cnt = 1              # section index
nanugi = 1              # number of list pages; overwritten by the crawl
nanugi_bijiao = 120     # articles per section; values above 500 mean "no limit"
rg_cnt = 7              # sections 1 .. rg_cnt-1 are crawled by the loops
############################################################################
start_now = int(time.time())  # start time
# testNews.start_switch()   # partial run
# testNews.start_all()      # full run
# testNews.start_switch1()  # single section
try:
    # The former `pool.map(testNews.start_all(), '')` already ran start_all()
    # right here in this process (the call happens before map() is entered) and
    # then mapped its None result over an empty iterable, so the pool never did
    # any work and was never closed.  The crawl also depends on the single
    # module-level browser session, so it is simply called directly.
    testNews.start_all()
    ############################################################################
    ends = int(time.time()) - start_now
    print('start_now, {} 초'.format(start_now))
    print('end_now, {} 초'.format(ends))
finally:
    # Always shut the headless browser down, even when the crawl fails.
    driver.quit()
    print('driver.quit() 되었습니다.')
'Python' 카테고리의 다른 글
200701-summarize_test_gensim,모듈화,test,bm25test (0) | 2020.07.02 |
---|---|
200701-yoyag_test_010, 요약뉴스처리,파이썬, 전처리 (0) | 2020.07.01 |
python: 200629-gensim_bm25-Copy1, BM25Okapi (0) | 2020.06.30 |
200625-yoyag_test_008-006.002, 젠심 요약본뉴스 (0) | 2020.06.25 |
python: # python 200623-yoyag_test_008-005 (0) | 2020.06.24 |
Comments