為接下來按刊名爬取期刊論文做準備。
# Load a Wanfang periodical search-result page in Chrome, click the
# "mainNode_5" element, then keep clicking the <input> found one <li> level
# deeper on each pass.  When no further input can be located/clicked, the
# current page HTML is stored in MongoDB (database "wanfang", collection
# "journal_name") and the script stops.
import time

from pymongo import MongoClient
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Raw string: the backslashes are Windows path separators, not escapes.
CHROMEDRIVER_PATH = r"D:\spider\cnki\driver\chromedriver.exe"

# Periodical search ("searchType=perio"), 50 results per page.  The
# percent-encoded parts decode to an abstract/start-year/end-year 2014 query
# and an "industrial technology" facet restricted to the NSTL source db.
SEARCH_URL = (
    "http://new.wanfangdata.com.cn/search/searchList.do"
    "?searchType=perio"
    "&searchWord=%20%E6%91%98%E8%A6%81:is%20%E8%B5%B7%E5%A7%8B%E5%B9%B4:2014"
    "%20%E7%BB%93%E6%9D%9F%E5%B9%B4:2014"
    "&facetField=subject_classcode_level:%E2%88%B7/T*"
    "$subject_classcode_level:%E2%88%B7/T*$source_db:NSTL%E2%88%B7"
    "&showType=detail&pageSize=50"
    "&facetName=%E5%B7%A5%E4%B8%9A%E6%8A%80%E6%9C%AF:$subject_classcode_level*NSTL:$source_db"
    "&isHit=&isHitUnit=&firstAuthor=false&rangeParame=all"
)

# Starting point of the nested <li> chain that is walked below.
NAV_ROOT_XPATH = "//ul[@class='x_navBox']/li[5]"

client = MongoClient("IP", 27017)
db = client["wanfang"]
collection = db["journal_name"]
db.authenticate("", "")

driver = webdriver.Chrome(CHROMEDRIVER_PATH)
try:
    driver.maximize_window()
    driver.get(SEARCH_URL)
    wait = WebDriverWait(driver, 20)

    # Fixed pause before the first explicit wait, as in the original script.
    time.sleep(5)
    element = wait.until(EC.visibility_of_element_located((By.ID, "mainNode_5")))
    element.click()

    i = 0
    xpathStr = NAV_ROOT_XPATH
    while True:
        print(i)
        i += 1
        # Descend one more <li> level on every pass.
        xpathStr = xpathStr + "/li"
        time.sleep(2)
        try:
            element = wait.until(
                EC.visibility_of_element_located((By.XPATH, xpathStr + "/input"))
            )
            element.click()
        except WebDriverException:
            # Covers the wait timing out as well as click failures (all
            # Selenium errors derive from WebDriverException) without also
            # swallowing KeyboardInterrupt/SystemExit like a bare except.
            # Nothing deeper could be selected: persist the page and stop.
            html = driver.page_source
            collection.insert({"html": html})
            break
finally:
    # Always release the browser process and the MongoDB connection, even
    # when the run is interrupted or fails part-way.
    driver.quit()
    client.close()