무회blog

python: 20200520-ver06_Premium_python # 20200520-Gmarket-댓글수 엑셀 성공함 본문

Python

python: 20200520-ver06_Premium_python # 20200520-Gmarket-댓글수 엑셀 성공함

최무회 2020. 5. 20. 16:17
# 20200520-Gmarket-ver06_Premium댓글수 엑셀 성공함
import re
import time
import pandas as pd
import requests as rq
from bs4 import BeautifulSoup
import selenium as se
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from datetime import datetime

# Launch Chrome via a local chromedriver. Raw string prevents the invalid
# escape sequences (\P, \G, \C, \A) the plain literal relied on by accident.
driver = webdriver.Chrome(r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe")
PAUSE_TIME = 1.5   # seconds to pause between page interactions
inpts = input()    # search keyword typed by the user

# Accumulators filled by cm_appendList(), one entry per scraped review.
prttx     = []   # premium review section title (first 8 chars)
prcnt     = []   # premium review count text
prContent = []   # review body text
prInfo    = []   # reviewer info text
def startGmarket():
    """Open Gmarket, search for the user's keyword, sort results by
    review count, open the first item, and click its review tab."""
    driver.get('https://www.gmarket.co.kr/')

    # Type the keyword into the search box and submit.
    search_box = driver.find_element_by_css_selector('#skip-navigation-search > span > input')
    search_box.send_keys(inpts)
    search_btn = driver.find_element_by_xpath('//*[@id="skip-navigation-search"]/span/button')
    search_btn.click()
    driver.implicitly_wait(20)
    time.sleep(PAUSE_TIME)

    # Open the sort dropdown and pick the 5th option (sort by review count).
    sort_toggle = driver.find_element_by_xpath('//*[@id="region__content-status-information"]/div/div/div[2]/div[1]/div[1]/button')
    sort_toggle.click()
    driver.implicitly_wait(20)
    time.sleep(PAUSE_TIME)
    sort_option = driver.find_element_by_css_selector('#region__content-status-information > div > div > div.box__control-area > div.box__sort-control.box__sort-control--active > div.box__sort-control-list > ul > li:nth-child(5) > a')
    sort_option.click()
    driver.implicitly_wait(20)

    # Parse the result page and navigate to the first item's link.
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    first_item = soup.select('div.box__item-container')[0]
    item_url = str(first_item.find('a').get('href'))
    driver.get(item_url)

    # Scroll down so the tab strip is in view, then open the review tab.
    driver.execute_script("window.scrollTo(0,800);")
    time.sleep(PAUSE_TIME)
    review_tab = driver.find_element_by_xpath('//*[@id="container"]/div[3]/div[1]/ul/li[2]')
    driver.implicitly_wait(10)
    review_tab.click()

def cm_review_alFeedback():
    """Return a BeautifulSoup tree of the review wrapper on the current page.

    The selected node list is round-tripped through str() so the result can
    be re-queried with CSS selectors as a standalone document.
    """
    page = BeautifulSoup(driver.page_source, 'html.parser')
    wrapper_nodes = page.body.select('#container #vip-tab_comment #review-wrapper')
    return BeautifulSoup(str(wrapper_nodes), 'html.parser')

def cm_appendList():
    """Scrape the premium reviews visible on the current page and append
    one row per review to the module-level accumulator lists
    (prttx, prcnt, prContent, prInfo).

    Fixes vs. original: drops three unused locals (premium_Page,
    premium_PageNext, premium_pagetotal), hoists the loop-invariant
    title/count lookups out of the loop, and tolerates a missing title node.
    """
    review = cm_review_alFeedback()
    premium_title = review.select('#review-wrapper > h3')
    premium_count = review.select('#review-wrapper > h3 > span')
    # Re-parse the review rows so the td selectors below match them directly.
    premium_reTbody = BeautifulSoup(
        str(review.select('#premium-wrapper > table > tbody > tr')),
        'html.parser')
    premium_td_content = premium_reTbody.select('td.comment-content')
    premium_td_info = premium_reTbody.select('td.info')

    # Title and count are the same for every row on the page — extract once.
    title_text = premium_title[0].get_text()[:8] if premium_title else ''
    count_text = premium_count[0].get_text() if premium_count else ''

    for i in range(len(premium_td_info)):
        prttx.append(title_text)                             # premium review title
        prcnt.append(count_text)                             # premium review count
        prContent.append(premium_td_content[i].get_text())   # review body
        prInfo.append(premium_td_info[i].get_text())         # reviewer info
    
def get_prPageCnt():
    """Count (and print) the page links in the premium-review pager."""
    pager_items = cm_review_alFeedback().select(
        '#premium-pagenation-wrap > div.board_pagenation > ul > li')
    page_count = len(pager_items)
    print(page_count)
    return page_count

def getNextPage():
    """Scrape every page in the current pager block, clicking through the
    numbered links between scrapes (the last page is scraped without a click)."""
    page_count = int(get_prPageCnt())
    last_index = page_count - 1
    for i in range(page_count):
        cm_appendList()
        if i < last_index:
            # Numbered page links start at li[2] in this pager markup.
            link_no = str(2 + i)
            next_link = driver.find_element_by_xpath(
                '//*[@id="premium-pagenation-wrap"]/div[1]/ul/li[' + link_no + ']/a')
            next_link.send_keys(Keys.ENTER)
            time.sleep(0.5)
            print(i)
            
def pr_NextPartClick():
    """Scrape the first pager block, then repeatedly press the pager's
    "next" arrow and scrape each following block of pages."""
    getNextPage()

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # NOTE(review): this <em> appears to hold the total page number — confirm.
    total_pages = int(
        soup.body.select(
            '#premium-pagenation-wrap > div.board_paging > span > em')[0].get_text())
    remaining_blocks = int(total_pages / 10)   # pager shows 10 pages per block
    print('-'*30+'fi:')
    print(remaining_blocks)

    if remaining_blocks > 0:
        for _ in range(remaining_blocks):
            driver.execute_script("window.scrollTo(0,2000);")
            next_arrow = driver.find_element_by_css_selector(
                '#premium-pagenation-wrap > div.board_pagenation > a.next')
            # JS click avoids interception by overlapping elements.
            driver.execute_script("arguments[0].click();", next_arrow)
            time.sleep(0.5)
            getNextPage()
    else:
        print('ending')
        return
    
    
# Run the crawl, then dump everything collected to an Excel file.
startGmarket()
pr_NextPartClick()

dic = {
    'prttx': prttx,
    'prcnt': prcnt,
    'prContent': prContent,
    'prInfo': prInfo,
}

td = datetime.today().strftime("%Y-%m-%d")
# Guard against an empty scrape so prcnt[0] cannot raise IndexError.
count_label = prcnt[0] if prcnt else '0'
excel_nm = td + '_' + inpts + '_Premium댓글수_' + count_label + '.xlsx'

df01 = pd.DataFrame(dic)
# Create the output directory if missing; to_excel fails otherwise.
import os
os.makedirs('./output', exist_ok=True)
df01.to_excel('./output/' + excel_nm)
# (removed the bare `df01` expression — it is a no-op outside a notebook)
print('#'*30+'prInfo: ')
print('success001')
Comments