爬取外文工業技術期刊名稱

右介發表於2018-03-09

為接下來按刊名爬取期刊論文做準備。

import time

from pymongo import MongoClient
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Expand the category tree on the Wanfang periodical search page one level at
# a time, then store the fully expanded page HTML in MongoDB so the journal
# names can be extracted from it later.
client = MongoClient("IP", 27017)
db = client["wanfang"]
collection = db["journal_name"]
# NOTE(review): credentials are placeholders in the published script; also
# Database.authenticate only exists in PyMongo < 4 — confirm the target version.
db.authenticate("", "")
# Raw string so the backslashes in the Windows path are never read as escapes.
driver = webdriver.Chrome(r"D:\spider\cnki\driver\chromedriver.exe")
try:
    driver.maximize_window()
    driver.get("http://new.wanfangdata.com.cn/search/searchList.do?searchType=perio&searchWord=%20%E6%91%98%E8%A6%81:is%20%E8%B5%B7%E5%A7%8B%E5%B9%B4:2014%20%E7%BB%93%E6%9D%9F%E5%B9%B4:2014&facetField=subject_classcode_level:%E2%88%B7/T*$subject_classcode_level:%E2%88%B7/T*$source_db:NSTL%E2%88%B7&showType=detail&pageSize=50&facetName=%E5%B7%A5%E4%B8%9A%E6%8A%80%E6%9C%AF:$subject_classcode_level*NSTL:$source_db&isHit=&isHitUnit=&firstAuthor=false&rangeParame=all")

    wait = WebDriverWait(driver, 20)

    # Fixed pause before the first interaction, on top of the explicit wait.
    time.sleep(5)
    element = wait.until(EC.visibility_of_element_located((By.ID, "mainNode_5")))
    element.click()

    i = 0
    xpathStr = "//ul[@class='x_navBox']/li[5]"
    while True:
        # Progress indicator: number of levels expanded so far.
        print(i)
        i += 1
        # Descend one more <li> level below the fifth navigation entry.
        xpathStr = xpathStr + "/li"
        time.sleep(2)
        try:
            element = wait.until(
                EC.visibility_of_element_located((By.XPATH, xpathStr + "/input"))
            )
            element.click()
        except WebDriverException:
            # No clickable node at this depth (wait timed out or the click
            # failed): treat the tree as fully expanded, save the page, stop.
            # Only Selenium errors are caught so Ctrl-C still aborts the run.
            collection.insert_one({"html": driver.page_source})
            break
finally:
    # Always release the browser process and the database connection, even
    # when the crawl aborts with an error.
    driver.quit()
    client.close()

 

相關文章