1.安裝與配置
pip install selenium
基本使用selenium都是為了動態載入網頁內容用於爬蟲,所以一般也會用到phantomjs
mac下如果要配置phantomjs環境的話
echo $PATH
ln -s <phantomjs地址> <PATH中任一路徑>
至於chromeDriver,配置方法類似,下載地址:
https://sites.google.com/a/chromium.org/chromedriver/downloads
2.程式碼樣例
# Sample: open Google in Firefox, search for "cheese!", wait until the page
# title reflects the search, and print the title before and after.
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait  # available since 2.4.0
from selenium.webdriver.support import expected_conditions as EC  # available since 2.26.0

# Create a new instance of the Firefox driver
driver = webdriver.Firefox()

# Everything after driver creation runs inside try/finally so the browser is
# closed even when navigation or element lookup fails.
try:
    # go to the google home page
    driver.get("http://www.google.com")

    # the page is ajaxy so the title is originally this:
    print(driver.title)

    # find the element whose name attribute is q (the google search box)
    inputElement = driver.find_element_by_name("q")

    # type in the search
    inputElement.send_keys("cheese!")

    # submit the form (although google automatically searches now without submitting)
    inputElement.submit()

    # we have to wait for the page to refresh; the last thing that seems to be
    # updated is the title (wait at most 10 seconds)
    WebDriverWait(driver, 10).until(EC.title_contains("cheese!"))

    # You should see "cheese! - Google Search"
    print(driver.title)
finally:
    driver.quit()
3.api速查
3.1定位元素
3.1.1 透過id查詢:
# Look up a single element by its id attribute (shorthand helper).
element = driver.find_element_by_id("coolestWidgetEvah")

# or: the same lookup spelled with the generic find_element and a By locator
from selenium.webdriver.common.by import By
element = driver.find_element(by=By.ID, value="coolestWidgetEvah")
3.1.2 透過class查詢
# Look up elements by class name (shorthand helper, plural find_elements form).
cheeses = driver.find_elements_by_class_name("cheese")

# or: the same lookup spelled with the generic find_elements and a By locator
from selenium.webdriver.common.by import By
cheeses = driver.find_elements(By.CLASS_NAME, "cheese")
3.1.3 透過標籤名稱查詢
# Look up a single element by its tag name (shorthand helper).
target_div = driver.find_element_by_tag_name("div")

# or: the same lookup spelled with the generic find_element and a By locator
from selenium.webdriver.common.by import By
target_div = driver.find_element(By.TAG_NAME, "div")
3.1.4 透過name屬性查詢
# Look up a single element by its name attribute (shorthand helper).
btn = driver.find_element_by_name("input_btn")

# or: the same lookup spelled with the generic find_element and a By locator
from selenium.webdriver.common.by import By
btn = driver.find_element(By.NAME, "input_btn")
3.1.5 透過連結的內容查詢
# Look up a link by its text (shorthand helper).
next_page = driver.find_element_by_link_text("下一頁")

# or: the same lookup spelled with the generic find_element and a By locator
from selenium.webdriver.common.by import By
next_page = driver.find_element(By.LINK_TEXT, "下一頁")
3.1.6 透過連結的部分內容查詢
# Look up a link by part of its text (shorthand helper).
next_page = driver.find_element_by_partial_link_text("去下一頁")

# or: the generic find_element with a By locator
# NOTE(review): the two forms pass different texts ("去下一頁" above, "下一頁"
# below); presumably both are meant to hit the same link via partial
# matching -- confirm which text was intended.
from selenium.webdriver.common.by import By
next_page = driver.find_element(By.PARTIAL_LINK_TEXT, "下一頁")
3.1.7 透過css查詢
# Look up a single element with a CSS selector (shorthand helper).
cheese = driver.find_element_by_css_selector("#food span.dairy.aged")

# or: the same lookup spelled with the generic find_element and a By locator
from selenium.webdriver.common.by import By
cheese = driver.find_element(By.CSS_SELECTOR, "#food span.dairy.aged")
3.1.8 透過xpath查詢
# Look up elements with an XPath expression (shorthand helper, plural form).
inputs = driver.find_elements_by_xpath("//input")

# or: the same lookup spelled with the generic find_elements and a By locator
from selenium.webdriver.common.by import By
inputs = driver.find_elements(By.XPATH, "//input")
3.1.9 透過js查詢
# Look up elements through JavaScript: collect the <label> elements, then run
# a script in the page that maps each label to the element whose id equals the
# label's "for" attribute.
labels = driver.find_elements_by_tag_name("label")

# `labels` is passed as the script's first argument (arguments[0]); the value
# the script returns is assigned to `inputs`.
inputs = driver.execute_script(
    "var labels = arguments[0], inputs = []; for (var i=0; i < labels.length; i++){" +
    "inputs.push(document.getElementById(labels[i].getAttribute('for'))); } return inputs;",
    labels)
3.2 獲取元素的文字資訊
# Read an element's text: locate the element, then access its `text` attribute.
element = driver.find_element_by_id("element_id")
element.text
3.3 修改userAgent
# Change the user agent: set the "general.useragent.override" preference on a
# Firefox profile and start Firefox with that profile.
profile = webdriver.FirefoxProfile()
profile.set_preference("general.useragent.override", "some UA string")
driver = webdriver.Firefox(profile)
3.4 cookies
# Cookie handling: set a cookie, list the cookies for the current URL, then
# delete cookies.

# Go to the correct domain
driver.get("http://www.example.com")

# Now set the cookie. Here's one for the entire domain
# the cookie name here is 'key' and its value is 'value'
driver.add_cookie({'name':'key', 'value':'value', 'path':'/'})
# additional keys that can be passed in are:
# 'domain' -> String,
# 'secure' -> Boolean,
# 'expiry' -> Milliseconds since the Epoch it should expire.
# NOTE(review): the W3C WebDriver spec describes expiry in seconds since the
# Unix epoch -- confirm the unit for the Selenium version in use.

# And now output all the available cookies for the current URL
for cookie in driver.get_cookies():
    print("%s -> %s" % (cookie['name'], cookie['value']))

# You can delete cookies in 2 ways
# By name
driver.delete_cookie("CookieName")
# Or all of them
driver.delete_all_cookies()
最後放一個自己的程式碼樣例好了,完成的功能為找到搜尋框輸入搜尋關鍵詞然後點選搜尋按鈕,然後開啟每個搜尋結果並且輸出網頁原始碼
# coding=utf-8
# Sample: type a keyword into the site's search box, click the search button,
# click every result link, then print the page source of every window other
# than the original one.
import time

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait  # available since 2.4.0
from selenium.webdriver.support import expected_conditions as EC  # available since 2.26.0

# Create a new instance of the Chrome driver
driver = webdriver.Chrome()

# Everything after driver creation runs inside try/finally so the browser is
# closed even when navigation or element lookup fails.
try:
    # go to the home page
    driver.get("http://www.zjcredit.gov.cn")

    # remember the handle of the current (main) window
    nowhandle = driver.current_window_handle
    print(driver.title)

    # find the element whose name attribute is qymc (the search box)
    inputElement = driver.find_element_by_name("qymc")
    print(inputElement)

    # type in the search
    inputElement.send_keys(u"同花順")

    # Compared with the google sample, this search is not a standard form and
    # cannot be submitted (inputElement.submit()), so click the button instead.
    driver.find_element_by_name("imageField").click()

    # overlap will happen if we do not move the page to the bottom:
    # the last link will be under another unrelated link if we do not scroll
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # find all links and click them, pausing 10 seconds after each click
    for item in driver.find_elements_by_xpath('//*[@id="pagetest2"]/div/table/tbody/tr/td/a'):
        item.click()
        time.sleep(10)

    # get the handles of all windows
    allhandles = driver.window_handles

    # look for the newly opened windows among all windows
    for handle in allhandles:
        if handle != nowhandle:
            # these two steps run in the pop-up window, showing that we
            # really switched into it
            driver.switch_to_window(handle)
            print(driver.page_source)
            # return to the main window
            driver.switch_to_window(nowhandle)
finally:
    driver.quit()