from urllib import request, error
from requests import RequestException
import lijzMD5
from lijzLog import *
import requests
import json, re, time, random, os
from selenium import webdriver
from bs4 import BeautifulSoup
UserAgentList = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
"Opera/8.0 (Windows NT 5.1; U; en)",
"Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
"Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2 ",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER) ",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E) ",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
"Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
"Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
"Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
"Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
"UCWEB7.0.2.37/28/999",
"NOKIA5700/ UCWEB7.0.2.37/28/999",
"Openwave/ UCWEB7.0.2.37/28/999",
"Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999"
]
def __singletion(cls):
"""
單例模式的裝飾器函式
:param cls: 實體類
:return: 返回實體類物件
"""
instances = {}
def getInstance(*args, **kwargs):
if cls not in instances:
instances[cls] = cls(*args, **kwargs)
return instances[cls]
return getInstance
@__singletion
class C_SeleniumPhantomJSCrawler(object):
def __init__(self, url, browserPath, outPath, parser='html5lib'):
self.url = url
self.browserPath = browserPath
self.outPath = outPath
self.parser = parser
self.logger = C_Logger('SeleniumPhantomJSCrawler', 'SeleniumPhantomJSCrawler.log', out=1).getLogger()
def __makeDir(self, path):
isExists = os.path.exists(path)
if not isExists:
try:
os.makedirs(path)
except NotImplementedError:
self.logger.error("makedir %s is Error" % path)
return False
return True
def startCrawler(self):
self.__makeDir(self.outPath)
driver = webdriver.Chrome(executable_path=self.browserPath)
driver.get(self.url)
bsObj = BeautifulSoup(driver.page_source, self.parser)
girlsList = driver.find_element_by_id('J_GirlsList').text.split('\n')
imagesUrl = re.findall('\/\/gtd\.alicdn\.com\/sns_logo.*\.jpg', driver.page_source)
girlsUrl = bsObj.find_all("a", {"href": re.compile("\/\/.*\.htm\?(userId=)\d*")})
# 所有妹子的名字地點
girlsNL = girlsList[::3]
# 所有妹子的身高體重
girlsHW = girlsList[1::3]
# 所有妹子的個人主頁地址
girlsHURL = [('http:' + i['href']) for i in girlsUrl]
# 所有妹子的封面圖片地址
girlsPhotoURL = [('https:' + i) for i in imagesUrl]
# 姓名地址 girlNL, 身高體重 girlHW
# 個人主頁地址 girlHRUL, 封面圖片 URL
girlsInfo = zip(girlsNL, girlsHW, girlsHURL, girlsPhotoURL)
for girlNL, girlHW, girlHURL, girlCover in girlsInfo:
# 為妹子建立資料夾
self.__makeDir(self.outPath + girlNL)
# 獲取妹子封面圖片
data = request.urlopen(girlCover).read()
with open(self.outPath + girlNL + '/cover.jpg', 'wb') as f:
f.write(data)
# 獲取妹子個人主頁中的圖片
self.__getImgs(girlHURL, self.outPath + girlNL)
driver.close()
def __getImgs(self,url, path):
driver = webdriver.Chrome(executable_path=self.browserPath)
driver.get(url)
bsObj = BeautifulSoup(driver.page_source, self.parser)
imgs = bsObj.find_all("img", {"src": re.compile(".*\.jpg")})
for i, img in enumerate(imgs[1:]):
try:
data = request.urlopen("http:" + img['src']).read()
names = lijzMD5.md5_uuid()[:9]
fileName = path + "/" + names + "_" + str(i+1) + ".jpg"
with open(fileName, 'wb') as f:
f.write(data)
except:
self.logger.error("請求地址錯誤!")
driver.close()
測試方式如下:
from lijzCrawler import *
if __name__ == '__main__':
url = "https://mm.taobao.com/search_tstar_model.htm?"
browserPath = "C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"
spider = C_SeleniumPhantomJSCrawler(url, browserPath, 'photo/')
spider.startCrawler()