爬蟲Selenium+PhantomJS爬取動態網站圖片資訊(Python)

LIJZ_Python發表於2018-03-24
from urllib import request, error
from requests import RequestException

import lijzMD5
from lijzLog import *
import requests
import json, re, time, random, os
from selenium import webdriver
from bs4 import BeautifulSoup



# Pool of browser User-Agent header strings covering desktop and mobile
# browsers. Nothing in the code visible in this file references the list;
# presumably it is meant for picking a random User-Agent per HTTP request
# -- TODO confirm against callers.
# NOTE: some entries end with a trailing space inside the quotes; they are
# kept exactly as found.
UserAgentList = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
    "Opera/8.0 (Windows NT 5.1; U; en)",
    "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2 ",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER) ",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E) ",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    "Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
    "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
    "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
    "UCWEB7.0.2.37/28/999",
    "NOKIA5700/ UCWEB7.0.2.37/28/999",
    "Openwave/ UCWEB7.0.2.37/28/999",
    "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999"
]


def __singletion(cls):
    """
    Class decorator that turns ``cls`` into a lazily created singleton.

    The decorated name is rebound to a factory function. The first call
    builds the instance with the arguments it receives; every later call
    returns that same instance and its arguments are ignored.

    :param cls: class to wrap
    :return: factory function returning the single shared instance of ``cls``
    """
    # Local import keeps the decorator independent of the module's import block.
    import functools

    instances = {}

    # Copy __name__/__doc__/__module__/__qualname__ onto the factory and expose
    # the original class as ``__wrapped__``. ``updated=()`` skips merging the
    # class ``__dict__`` into the function, which would leak methods onto it.
    @functools.wraps(cls, updated=())
    def getInstance(*args, **kwargs):
        if cls not in instances:
            instances[cls] = cls(*args, **kwargs)
        return instances[cls]
    return getInstance
@__singletion
class C_SeleniumPhantomJSCrawler(object):
    """
    Crawler that loads a listing page in Chrome through Selenium and stores,
    for every listed model, a cover image plus the ``.jpg`` images found on
    the model's personal page.

    Despite the class name, the code drives ``webdriver.Chrome``. Because of
    the singleton decorator, only the first construction takes effect.
    """

    def __init__(self, url, browserPath, outPath, parser='html5lib'):
        """
        :param url: address of the listing page to crawl
        :param browserPath: path of the chromedriver executable
        :param outPath: output directory prefix; it is concatenated directly
                        with each model's name, so it should end with a
                        path separator
        :param parser: parser name handed to BeautifulSoup
        """
        self.url = url
        self.browserPath = browserPath
        self.outPath = outPath
        self.parser = parser
        self.logger = C_Logger('SeleniumPhantomJSCrawler', 'SeleniumPhantomJSCrawler.log', out=1).getLogger()

    def __makeDir(self, path):
        """
        Create ``path`` (including missing parents) unless it already exists.

        :param path: directory to create
        :return: True if the path already existed or was created,
                 False if creation failed (the failure is logged)
        """
        if os.path.exists(path):
            return True
        try:
            # exist_ok covers the directory appearing between check and call.
            os.makedirs(path, exist_ok=True)
        except OSError:
            # os.makedirs reports failures as OSError (permissions, bad name...).
            self.logger.error("makedir %s is Error" % path)
            return False
        return True

    def startCrawler(self):
        """
        Crawl the listing page at ``self.url`` and download every model's
        cover image and personal-page images into ``self.outPath``.

        The browser session is always shut down, even when the crawl fails.
        Errors raised while downloading a cover image propagate to the caller.
        """
        self.__makeDir(self.outPath)
        driver = webdriver.Chrome(executable_path=self.browserPath)
        try:
            driver.get(self.url)
            bsObj = BeautifulSoup(driver.page_source, self.parser)
            girlsList = driver.find_element_by_id('J_GirlsList').text.split('\n')
            imagesUrl = re.findall(r'\/\/gtd\.alicdn\.com\/sns_logo.*\.jpg', driver.page_source)
            girlsUrl = bsObj.find_all("a", {"href": re.compile(r"\/\/.*\.htm\?(userId=)\d*")})
            # The element text is consumed in groups of three lines:
            # line 0 of each group is name + location, line 1 is height + weight.
            girlsNL = girlsList[::3]
            girlsHW = girlsList[1::3]
            # Personal page address of every model (hrefs are protocol-relative).
            girlsHURL = [('http:' + i['href']) for i in girlsUrl]
            # Cover image address of every model.
            girlsPhotoURL = [('https:' + i) for i in imagesUrl]
            # zip() stops at the shortest list, so unmatched entries are dropped.
            girlsInfo = zip(girlsNL, girlsHW, girlsHURL, girlsPhotoURL)
            for girlNL, girlHW, girlHURL, girlCover in girlsInfo:
                # One directory per model; skip the model if it cannot be created
                # (the failure has already been logged by __makeDir).
                girlDir = self.outPath + girlNL
                if not self.__makeDir(girlDir):
                    continue
                # Fetch the cover image; the response is closed by the with-block.
                with request.urlopen(girlCover) as resp:
                    data = resp.read()
                with open(girlDir + '/cover.jpg', 'wb') as f:
                    f.write(data)
                # Fetch the images shown on the model's personal page.
                self.__getImgs(girlHURL, girlDir)
        finally:
            # quit() ends the whole driver session, not just the current window.
            driver.quit()

    def __getImgs(self, url, path):
        """
        Download the ``.jpg`` images referenced by ``<img>`` tags on ``url``.

        The first matching image is skipped. Each file is saved as
        ``<path>/<9-char id>_<index>.jpg``. A failure on one image is logged
        and does not stop the remaining downloads.

        :param url: page whose images are collected
        :param path: existing directory that receives the files
        """
        driver = webdriver.Chrome(executable_path=self.browserPath)
        try:
            driver.get(url)
            bsObj = BeautifulSoup(driver.page_source, self.parser)
        finally:
            # The parsed snapshot is all that is needed; release the browser
            # before the downloads start.
            driver.quit()
        imgs = bsObj.find_all("img", {"src": re.compile(r".*\.jpg")})
        for i, img in enumerate(imgs[1:]):
            try:
                # src values are treated as protocol-relative ("//host/...").
                with request.urlopen("http:" + img['src']) as resp:
                    data = resp.read()
                names = lijzMD5.md5_uuid()[:9]
                fileName = path + "/" + names + "_" + str(i+1) + ".jpg"
                with open(fileName, 'wb') as f:
                    f.write(data)
            except Exception:
                # Best effort per image; Exception (not a bare except) lets
                # KeyboardInterrupt/SystemExit still stop the crawl.
                self.logger.error("請求地址錯誤!")

測試方式如下:

from lijzCrawler import *


if __name__ == '__main__':
    # Listing page to crawl.
    url = "https://mm.taobao.com/search_tstar_model.htm?"
    # Raw string keeps the Windows backslashes literal instead of relying on
    # unrecognised escape sequences (\P, \G, \C, \A, \c); the value is unchanged.
    browserPath = r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"
    # Output prefix ends with '/' because the crawler concatenates names onto it.
    spider = C_SeleniumPhantomJSCrawler(url, browserPath, 'photo/')
    spider.startCrawler()



相關文章