爬取市場價格,全國農產品商務資訊公共服務平臺

資料小蝸Leon發表於2020-12-27

聲明:程式碼僅作學習交流用途,程式碼分享者與創作者不承擔任何由他人惡意執行而導致的責任。請勿擅自修改限制頻率的引數,請勿惡意攻擊網頁;請學習瀏覽者遵守社會公德與法律秩序。爬蟲導致的網頁崩潰等損失由計算機操作者負全部責任,造成嚴重後果者需要承擔刑事責任。
全國農產品商務資訊公共服務平臺爬取

import requests
from fake_useragent import UserAgent
from lxml import etree
from time import sleep
from random import randint
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
#from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from multiprocessing import  Process
import threading
import re
from tqdm import tqdm
from selenium.webdriver.chrome.options import Options
# Target listing page. ``craftName`` is the URL-encoded commodity name.
# Alternative commodities used with this script (swap into url_base):
#   pork:    'https://nc.mofcom.gov.cn/jghq/priceList?craftName=%E7%8C%AA%E8%82%89'
#   mutton:  'https://nc.mofcom.gov.cn/jghq/priceList?craftName=%E7%BE%8A%E8%82%89'
#   corn:    'https://nc.mofcom.gov.cn/jghq/priceList?craftName=%E7%8E%89%E7%B1%B3'
#   chicken: 'https://nc.mofcom.gov.cn/jghq/priceList?craftName=%E7%99%BD%E6%9D%A1%E9%B8%A1'
url_base = 'https://nc.mofcom.gov.cn/jghq/priceList?craftName=%E9%B8%A1%E8%9B%8B'  # eggs

options = Options()
UA = UserAgent().edge
# Pass the user agent unquoted: the argument is handed to Chrome without a
# shell, so surrounding quotes (and the trailing space the old format string
# added) would become part of the header value itself.
options.add_argument('--user-agent={}'.format(UA))
# Optional proxy, e.g. 124.236.111.11:80:
#   options.add_argument('--proxy-server={}'.format(proxy))
options.binary_location = "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe"

# Create exactly ONE browser. The CDP script below is registered on this
# session only; creating a second driver afterwards would both leak this
# browser process and lose the navigator.webdriver patch.
# (A chromedriver path can be supplied if it is not on PATH, e.g.
#  "D:\Program Files\python3.7\chromedriver.exe".)
edge = webdriver.Chrome(options=options)
# Hide the automation flag from page scripts: navigator.webdriver is redefined
# to return undefined in every new document before the page's own JS runs.
edge.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": """
           Object.defineProperty(navigator, 'webdriver',{
           get: () => undefined
           })
           """
})
def _click_when_ready(driver, xpath, timeout=10):
    """Wait until the element at *xpath* is clickable, then click it.

    Raises selenium's TimeoutException if the element does not become
    clickable within *timeout* seconds, instead of failing immediately
    with NoSuchElementException while the page is still rendering.
    """
    WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable((By.XPATH, xpath))
    ).click()


edge.get(url_base)

# Filter selection, clicked strictly in this order. What each control means is
# inferred from the selectors only -- TODO confirm against the live page.
_filter_clicks = (
    # Opens the "eudName" selector (presumably the region/market chooser).
    '//*[@id="eudName"]',
    # Second tab of the pop-up panel, then the second entry of its second list.
    '/html/body/div[3]/div[1]/a[2]',
    '/html/body/div[3]/div[2]/ul[2]/li[2]',
    # Opens the drop-down in the third search-form group ...
    '//*[@id="searchForm"]/div/div[3]/div[1]/div/input',
    # ... and picks its third option (dd[4] was the previously used choice).
    '//*[@id="searchForm"]/div/div[3]/div[1]/dl/dd[3]',
    # Submits the search.
    '//*[@id="searchBtn"]',
)
# NOTE: an earlier variant also picked explicit cells in the layui date picker
# ('//*[@id="layui-laydate1"]/...'); that step is intentionally disabled.
for _xpath in _filter_clicks:
    _click_when_ready(edge, _xpath)
# Column accumulators, filled page by page and consumed after the loop.
data_all = []
product_all = []
price_all = []
market_all = []

# Give the first result page time to render after the search was submitted.
sleep(2)

# Text of the pager's "next page" link. The Traditional form is what this
# script originally compared against; the Simplified form is accepted as well
# because the site presumably renders Simplified Chinese -- TODO confirm.
_next_page_labels = (['下一頁'], ['下一页'])

try:
    while True:
        html = edge.page_source
        e = etree.HTML(html)

        # One text node per table row for each of the four columns.
        data = e.xpath('''//table[@class='table-01 mt30']/tbody[1]/tr/td[1]/text()''')
        product = e.xpath('''//table[@class='table-01 mt30']/tbody[1]/tr/td[2]/span/text()''')
        price = e.xpath('''//*[@id="showList"]/table/tbody/tr/td[3]/span/text()''')
        market = e.xpath('''//*[@id="showList"]/table/tbody/tr/td[4]/a/text()''')
        print(data)

        # extend() appends in place instead of rebuilding the lists each page.
        data_all.extend(data)
        product_all.extend(product)
        price_all.extend(price)
        market_all.extend(market)
        print(data_all)

        # The second-to-last pager anchor is the "next page" link while more
        # pages remain; anything else means this was the last page.
        if e.xpath('''//*[@id="pageFooter"]/a[last()-1]/text()''') not in _next_page_labels:
            break

        # Same anchor addressed by absolute path (per the original author's
        # note it matches //*[@id="pageFooter"]/a[last()-1]).
        edge.find_element(By.XPATH, '''/html/body/div[2]/div/div[1]/div[3]/a[last()-1]''').click()
        # Throttle AFTER the click: keeps the 5 s request spacing and gives the
        # next page time to load before page_source is read again, so the old
        # page is not scraped twice.
        sleep(5)
finally:
    # Always release the browser process, even if scraping fails midway.
    edge.quit()
# Assemble the scraped columns and export them as a GBK-encoded CSV.
all_info = {
    '資料年月': data_all,
    '產品': product_all,
    '價格': price_all,
    '市場': market_all,
}

# The columns come from independent XPath queries, so a row with a missing
# cell leaves them with different lengths. pd.DataFrame(dict_of_lists) would
# then raise ValueError and the whole scrape would be lost; report the
# mismatch instead and pad the short columns with empty cells.
_column_lengths = {name: len(values) for name, values in all_info.items()}
if len(set(_column_lengths.values())) > 1:
    print('WARNING: scraped columns differ in length {}; short columns are '
          'padded with empty cells and rows may be misaligned'.format(_column_lengths))

# Building from Series aligns on the row index, which yields the same frame as
# the plain dict-of-lists constructor whenever all lengths match.
outdata = pd.DataFrame(
    {name: pd.Series(values, dtype=object) for name, values in all_info.items()}
)
outdata.to_csv('C:\\Users\\Admin\\PycharmProjects\\untitled\\雞蛋價格.csv', encoding='GBK')

相關文章