250x250
Notice
Recent Posts
Recent Comments
일 | 월 | 화 | 수 | 목 | 금 | 토 |
---|---|---|---|---|---|---|
 | | | | | 1 | 2 |
3 | 4 | 5 | 6 | 7 | 8 | 9 |
10 | 11 | 12 | 13 | 14 | 15 | 16 |
17 | 18 | 19 | 20 | 21 | 22 | 23 |
24 | 25 | 26 | 27 | 28 | 29 | 30 |
Tags
- Topics
- Python
- r
- 토픽추출
- test
- mysql
- 지마켓
- 幼稚园杀手(유치원킬러)
- 이력서
- 과학백과사전
- 게시판 만들기
- (깃)git bash
- word2vec
- lda
- java
- jsp 파일 설정
- 크롤링
- 방식으로 텍스트
- Gmarket
- 코사인 유사도
- 파이썬
- Websocket
- oracle
- 자바
- RESFUL
- db
- pytorch
- spring MVC(모델2)방식
- tomoto
- 네이버뉴스
Archives
- Today
- Total
무회blog
200624-파이썬_004.010_headless_결측치제거 본문
In [1]:
# 200624-파이썬_004.010_headless_결측치제거
# parameter 변경 후 테스트
# No parts
# ToDayURL -> https://news.naver.com/main/list.nhn?mode=LSD&mid=sec&sid1=001&date=20200612&page=1
# Multiprocessing으로 병렬 크롤링하기
from newspaper import Article
from multiprocessing import Pool
import time ,re , timeit ,os
import pandas as pd
import requests as rq
import selenium as se
from selenium import webdriver
from bs4 import BeautifulSoup
from bs4 import SoupStrainer
from selenium import webdriver
from datetime import datetime
# Sample the clock once so that every stamp below describes the same instant
# (separate today()/now() calls could fall on either side of midnight).
now = datetime.now()
td = now.strftime("%Y%m%d")            # YYYYMMDD, used as the `date=` URL parameter
tdd = now.strftime("%m%d")             # MMDD, used when building folder names
tdnow = now.strftime('%Y%m%d%H%M%S')   # timestamp prefix for generated workbooks

# Headless Chrome configuration.
options = webdriver.ChromeOptions()
options.add_argument('headless')
options.add_argument('window-size=1920x1080')
options.add_argument("disable-gpu")
# Alternatively: options.add_argument("--disable-gpu")
# Raw string so the backslashes of the Windows path are never interpreted as
# escape sequences (the value is unchanged; the invalid-escape warning is gone).
# NOTE(review): passing the driver path positionally is the Selenium 3 calling
# convention - confirm the installed Selenium version before upgrading.
driver = webdriver.Chrome(r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe", options = options)
############ Create the working folders (and record whether they existed)
# NOTE(review): the literal '20' + MMDD yields a YYMMDD-looking name only for
# the year 2020 - confirm whether strftime("%y%m%d") was intended.
creatFolder_in = './news001/'+'20'+tdd + '/'            # per-section workbooks written by the crawl
creatFolder_out00 = './news002/yibufen/'+'20'+tdd + '/'  # output of the partial run
creatFolder_out01 = './news002/quanbu/'+'20'+tdd + '/'   # output of the full run

# Existence flags are captured before creation, as in the original script.
checkFolder = os.path.isdir(creatFolder_in)
checkFolder00 = os.path.isdir(creatFolder_out00)
checkFolder01 = os.path.isdir(creatFolder_out01)

# makedirs also creates missing parents (e.g. ./news002/) and, with
# exist_ok=True, does nothing when the folder is already there.
for _folder in (creatFolder_in, creatFolder_out00, creatFolder_out01):
    os.makedirs(_folder, exist_ok=True)
In [2]:
# CSS selector of the "summary bot" button on an article page.
slc_yoyag = '#main_content > div.article_header > div.article_info > div > div.article_btns > div.article_btns_right > div.media_end_head_autosummary._auto_summary_wrapper > a'
# CSS selector of the summary text shown after that button has been clicked.
slc_yoyag_text = '#main_content > div.article_header > div.article_info > div > div.article_btns > div.article_btns_right > div.media_end_head_autosummary._auto_summary_wrapper > div > div.media_end_head_autosummary_layer_body > div._contents_body'

# One listing URL per section, indexed by tg_cnt:
#   0 all, 1 politics, 2 economy, 3 society, 4 life/culture,
#   5 world, 6 IT/science, 7 opinion
pd_link = [
    'https://news.naver.com/main/list.nhn?mode=LSD&mid=sec&sid1=' + section_id
    for section_id in ('001', '100', '101', '102', '103', '104', '105', '110')
]

# Query-string tail for each section (same index as pd_link): today's date, page 1.
pg_link = ['&date=' + td + '&page=1' for _ in pd_link]
pg_link
lis = []
class testNews:
    """Crawl Naver news section listings and save summarised articles to Excel.

    The class is only a namespace: every method is static and is invoked as
    ``testNews.<name>()``.  Settings come from module-level variables
    (``tg_cnt``, ``tg_date``, ``paging``, ``nanugi_bijiao``, ``rg_cnt``) and
    pages are fetched through the module-level Selenium ``driver``.

    Section indices (``tg_cnt``):
        0 all, 1 politics, 2 economy, 3 society,
        4 life/culture, 5 world, 6 IT/science, 7 opinion
    """

    # Listing blocks on a section page whose anchors point at articles.
    _LIST_SELECTORS = (
        '#main_content > div.list_body.newsflash_body > ul.type06_headline',
        '#main_content > div.list_body.newsflash_body > ul.type06',
    )

    @staticmethod
    def _section_title(raw_title, is_all_section):
        """Return a file/sheet-safe title built from the active menu label.

        Punctuation that is awkward in file names is replaced by ``_``.  For
        the "all" section the label is prefixed with '전체'; for any other
        section the last three characters are cut off (presumably a trailing
        suffix of the menu label - TODO confirm against the live page).
        """
        if is_all_section:
            return re.sub("[-=.#/?:$}]", '_', '전체' + raw_title)
        return re.sub("[-=.#/?:$}]", '_', raw_title)[:-3]

    @staticmethod
    def _listing_urls(section_url, query_template, date, last_page_probe):
        """Build the URLs used to page through one section.

        ``query_template`` has the form ``&date=<d>&page=<n>``.  Only its two
        parameter names are reused; the values are replaced, so the result no
        longer depends on ``date`` having the same length as the date embedded
        in the template.

        Returns:
            (probe_url, page_url_prefix): ``probe_url`` requests page
            ``last_page_probe``; ``page_url_prefix`` ends in ``page=`` and is
            completed by appending a page number.
        """
        date_key, _, remainder = query_template.partition('=')
        page_key = remainder.partition('&')[2].partition('=')[0]
        prefix = '{}{}={}&{}='.format(section_url, date_key, date, page_key)
        return prefix + str(last_page_probe), prefix

    @staticmethod
    def excuteFunc():
        """Crawl the section selected by ``tg_cnt`` and write it to a workbook.

        Only articles whose "summary bot" button is present and displayed are
        kept.  For each of them the press name, title, body and summary text
        are collected.  The result is saved to
        ``creatFolder_in/<tdnow>_<tg_cnt>_<pages>.xlsx``.

        Side effect: the module-level ``nanugi`` is set to the number of
        listing pages reported by the site.
        """
        global nanugi  # updated with the number of listing pages found

        ## Title of the section
        driver.get(pd_link[tg_cnt] + pg_link[tg_cnt])
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        if tg_cnt == 0:
            anchors = soup.select('#snb > h2 > a')
        else:
            anchors = soup.select('#snb > ul > li.on > a')
        pdg_src_title = testNews._section_title(anchors[0].get_text(), tg_cnt == 0)
        print('pdg_src_title, 분야 ---> {}'.format(pdg_src_title))

        ### Number of listing pages: request the page given by `paging` and
        ### read the page number the pager highlights.
        probe_url, page_url_prefix = testNews._listing_urls(
            pd_link[tg_cnt], pg_link[tg_cnt], tg_date, paging)
        driver.get(probe_url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        pager = soup.select('#main_content > div.paging > strong')
        totalPg_cnt = int(pager[0].get_text())  # total number of pages
        totalPg_cnt_nanugi = int(totalPg_cnt)   # number of pages to walk
        nanugi = totalPg_cnt_nanugi

        li00 = []  # section title
        li01 = []  # press name
        li02 = []  # article title
        li03 = []  # article body
        li04 = []  # summary text
        frmNews = {}
        cked = 0   # number of summarised articles collected so far
        print('초기화용 frmNews,{} nanugi_bijiao,{}'.format(len(frmNews), nanugi_bijiao))
        print('nanugi,{} totalPg_cnt, {} totalPg_cnt_nanugi, {}'.format(nanugi, totalPg_cnt , totalPg_cnt_nanugi))

        ### Walk the listing pages and collect the articles
        limit_reached = False
        for j in range(totalPg_cnt_nanugi):
            if limit_reached:
                break
            driver.get(page_url_prefix + str(j + 1))
            body = BeautifulSoup(driver.page_source, 'html.parser').body

            hrefs = []
            for list_selector in testNews._LIST_SELECTORS:
                for listing in body.select(list_selector):
                    hrefs.extend(a.get('href') for a in listing.find_all('a'))
            # De-duplicate while keeping page order; skip anchors without href
            # so that driver.get() is never handed None.
            article_urls = [u for u in dict.fromkeys(hrefs) if u]

            for article_url in article_urls:
                driver.get(article_url)
                article = BeautifulSoup(driver.page_source, 'html.parser').body
                summary_buttons = driver.find_elements_by_css_selector(slc_yoyag)
                if not summary_buttons or not summary_buttons[0].is_displayed():
                    continue  # no usable summary button: skip the article

                _logo = article.find_all(class_="press_logo")[0].find('img').get('alt')
                _title = article.select('#articleTitle')[0].get_text()
                _content = article.select('#articleBodyContents')[0].get_text().strip()

                summary_buttons[0].click()
                time.sleep(1)  # give the summary layer time to render
                opened = BeautifulSoup(driver.page_source, 'html.parser')
                summary_nodes = opened.body.select(slc_yoyag_text)
                # Exactly one summary value per article (empty when the layer
                # did not render) keeps all five columns the same length.
                _press_yoyag = ''.join(node.get_text() for node in summary_nodes)

                li00.append(pdg_src_title)
                li01.append(_logo)
                li02.append(_title)
                li03.append(_content)
                li04.append(_press_yoyag)
                cked = cked + 1
                print(" j {} cked={}".format(j,cked) , end= ',')

                # nanugi_bijiao > 500 means "no limit" (full run); otherwise
                # stop once that many articles have been collected.
                if nanugi_bijiao <= 500 and cked >= nanugi_bijiao:
                    limit_reached = True
                    break

        frmNews = {'subMenu':li00, 'newsFrom':li01, 'title':li02, 'content':li03 , 'yoyag':li04}
        df = pd.DataFrame(frmNews)
        # Remove the flash-workaround script text that precedes the article body.
        df['content']=df['content'].str.replace("// flash 오류를 우회하기 위한 함수 추가\n","").str.replace("function _flash_removeCallback()","")
        df['content']=df['content'].str[3:].str.replace("\n","").str.replace("\t","").str.replace("{}","").str.strip()
        excel_nm = tdnow+'_'+str(tg_cnt)+'_'+str(nanugi)+ '.xlsx'
        df.to_excel(creatFolder_in +excel_nm)
        print('리스트용 frmNews,{}'.format(len(frmNews)))
        print('-'*50)

    ############################################################################
    ## Partial run: crawl sections 1 .. rg_cnt-1, then merge them
    @staticmethod
    def start_switch():
        """Crawl every section except index 0, then call ``second_run()``."""
        global tg_cnt
        for z in range(rg_cnt):
            if z == 0:
                continue  # section 0 ("all") is skipped
            tg_cnt = z  # current section
            print('start_switch tg_cnt, {}'.format(tg_cnt))
            testNews.excuteFunc()
        second_run()

    ############################################################################
    ## Full run: crawl sections 1 .. rg_cnt-1, then merge them
    @staticmethod
    def start_all():
        """Crawl every section except index 0, then call ``second_all()``."""
        global tg_cnt
        for z in range(rg_cnt):
            if z == 0:
                continue  # section 0 ("all") is skipped
            tg_cnt = z
            print('start_all tg_cnt, {}'.format(tg_cnt))
            testNews.excuteFunc()
        second_all()

    ############################################################################
    ## Single run: only the section currently selected by tg_cnt
    @staticmethod
    def start_switch1():
        """Crawl the section selected by ``tg_cnt``, then call ``select_First()``."""
        print('start_switch1 ')
        testNews.excuteFunc()
        select_First()
In [3]:
############################################################################
# Shared state for the workbook helpers defined in this cell.
print("#" * 50)
time.sleep(2)
path = creatFolder_in   # folder holding the per-section workbooks
sheet_reNm = []         # sheet titles collected by select_First()
###########################################################################
## Drop rows that contain missing values
def drop_naValue():
    """Clean the newest workbook in ``creatFolder_out01`` and save a copy.

    The file whose name sorts last in the folder is read sheet by sheet (the
    first ``rg_cnt - 1`` sheets).  For each sheet, rows with a duplicated
    ``content`` value or with any missing value are dropped and at most 100
    rows are kept.  The cleaned sheets are written to
    ``<tdnow>_Replace_quanbu_nv.xlsx`` in the same folder, one sheet per
    section, and the folder is opened with ``os.startfile`` (available on
    Windows only).
    """
    newest_first = sorted(os.listdir(creatFolder_out01), reverse=True)  # descending
    filename = newest_first[0]
    path2 = creatFolder_out01 + filename
    print(path2)
    print('-'*50)

    columns = ['subMenu','newsFrom','title','content','yoyag']
    # Read each sheet and drop duplicated / incomplete rows.
    dfs_list = [
        pd.read_excel(path2 , sheet_name = x).drop_duplicates('content').dropna(axis=0).head(100)
        for x in range(rg_cnt-1)
    ]

    excel_reNm = creatFolder_out01+'/'+tdnow +'_Replace'+'_quanbu_nv.xlsx'  # output path
    # The context manager saves and closes the workbook, also on error.
    with pd.ExcelWriter(excel_reNm, engine = 'xlsxwriter') as writer:
        for frame in dfs_list:
            if frame.empty:
                # Nothing survived the cleaning: there is no title to name a sheet after.
                print('skip empty sheet')
                continue
            # Positional access: after dropping rows, index label 0 may be gone.
            sheet_title = frame['subMenu'].iloc[0]
            frame[columns].to_excel(writer, sheet_name = sheet_title)
    print("success_writer_Replace")

    os.startfile(os.path.realpath(creatFolder_out01))
###########################################################################
## Single run: post-process the workbook of the one selected section
def select_First():
    """Copy the workbook of the current section into ``./news002/``.

    Reads ``<tdnow>_<tg_cnt>_<nanugi>.xlsx`` from ``path`` - the name under
    which the crawl saves a section - and writes it back as
    ``./news002/<tdnow>_<tg_cnt>_<nanugi>_select_First.xlsx`` with the sheet
    named after the section title.  The title is also appended to the
    module-level ``sheet_reNm`` list.
    """
    base_nm = tdnow+'_'+str(tg_cnt)+'_'+str(nanugi)
    source_nm = base_nm + '.xlsx'                     # file produced by the crawl
    excel_nm = base_nm + '_select_First' + '.xlsx'    # file produced here
    print(type(excel_nm))
    df00 = pd.read_excel(path+source_nm)

    # Positional access; use this call's title rather than sheet_reNm[0],
    # which would be stale on a second invocation.
    sheet_title = df00['subMenu'].iloc[0]
    sheet_reNm.append(sheet_title)
    excel_wrNm = './news002/'+str(excel_nm)
    print('sheet_reNm,{}'.format(sheet_reNm))
    print('excel_wrNm,{}'.format(excel_wrNm))

    df00['subMenu'] = sheet_title
    # The context manager saves and closes the workbook, also on error.
    with pd.ExcelWriter(excel_wrNm, engine = 'xlsxwriter') as writer:
        df00.to_excel(writer, sheet_name = sheet_title)
    print("success_writer")
##################################################
# ## Partial run: merge the per-section workbooks
def second_run():
    """Merge every workbook found in ``path`` into one multi-sheet file.

    Each workbook becomes one sheet, named after the first ``subMenu`` value
    it contains.  The result is written to
    ``./news002/yibufen/<tdnow>_yibufen_nv.xlsx``.
    """
    # Only real workbooks: skip other files and Excel's '~$' lock files.
    ex_nm = [x for x in os.listdir(path) if x.endswith('.xlsx') and not x.startswith('~$')]
    dfs = [pd.read_excel(path+x) for x in ex_nm]

    excel_reNm = './news002/yibufen/'+tdnow +'_yibufen_nv.xlsx'  # output path
    # The context manager saves and closes the workbook, also on error.
    with pd.ExcelWriter(excel_reNm, engine = 'xlsxwriter') as writer:
        for frame in dfs:
            if frame.empty:
                continue  # no rows, hence no section title to name the sheet
            frame.to_excel(writer, sheet_name = frame['subMenu'].iloc[0])
    print("success_writer")
##################################################
# ## Full run: merge the per-section workbooks
# (per-section files are named <tdnow>_<tg_cnt>_<nanugi>.xlsx)
def second_all():
    """Merge every workbook found in ``path`` into one multi-sheet file.

    Each workbook becomes one sheet, named after the first ``subMenu`` value
    it contains.  The result is written to
    ``creatFolder_out01/<tdnow>_meichuliqian_quanbu.xlsx``.
    """
    # Only real workbooks: skip other files and Excel's '~$' lock files.
    ex_nm = [x for x in os.listdir(path) if x.endswith('.xlsx') and not x.startswith('~$')]
    dfs = [pd.read_excel(path+x) for x in ex_nm]

    excel_reNm = creatFolder_out01+'/'+tdnow +'_meichuliqian_quanbu.xlsx'  # output path
    # The context manager saves and closes the workbook, also on error.
    with pd.ExcelWriter(excel_reNm, engine = 'xlsxwriter') as writer:
        for frame in dfs:
            if frame.empty:
                continue  # no rows, hence no section title to name the sheet
            frame.to_excel(writer, sheet_name = frame['subMenu'].iloc[0])
    print("success_writer")
# Two-second pause.
# NOTE(review): the page lost its indentation, so it is unclear whether this
# pause is a top-level statement or the last line of second_all() - confirm.
time.sleep(2)
# second_all()  # manual invocation, left disabled
In [ ]:
## Default settings
# td = datetime.today().strftime("%Y%m%d")
# tdnow = now.strftime('%Y%m%d%H%M%S')
tg_date = td          # date to crawl
paging = '601'        # page number requested to discover the last listing page
tg_cnt = 1            # section index
nanugi = 1            # number of pages to extract (overwritten by the crawl)
nanugi_bijiao = 120   # max articles per section; a value > 500 disables the limit
rg_cnt = 7            # sections 1 .. rg_cnt-1 are crawled by the loop runners
############################################################################
start_now = int(time.time())  # start time
# testNews.start_switch()   # partial run
# testNews.start_all()      # full run
# testNews.start_switch1()  # single section
try:
    # The crawl is called directly.  The former
    # `pool.map(testNews.start_all(), '')` already ran it in this process
    # (the call was evaluated before map() saw it) and then mapped its None
    # result over an empty iterable, so the worker pool never did any work
    # and was never closed.
    testNews.start_all()
    drop_naValue()
    ############################################################################
    ends = int(time.time()) - start_now
    print('start_now, {} 초'.format(start_now))
    print('end_now, {} 초'.format(ends))
finally:
    # Always release the browser, even when the crawl fails part-way.
    driver.quit()
    print('driver.quit() 되었습니다.')
'Python' 카테고리의 다른 글
200623-파이썬_004.009_headless_결측치제거(성공) (0) | 2020.06.24 |
---|---|
200623-yoyag_test_008-002, 요약테스트 (0) | 2020.06.24 |
os파일생성_및_사용법 (0) | 2020.06.22 |
200619-파이썬_004.005_multiprocessing ,options_headless (0) | 2020.06.19 |
python: 200616-파이썬, writeFile_NewsContent-004.002-Copy1 (0) | 2020.06.16 |
Comments