#coding=utf-8
"""Scrape question/answer topics from wenda.autohome.com.cn into MongoDB.

For every topic id in a fixed range the script loads the topic page in
Chrome (via Selenium), parses the rendered HTML with lxml, expands answers
whose preview text ends in '...', and inserts one document per topic into
the ``wenda_autohome`` collection of the ``Automobile`` database.
"""
import time
import requests
from lxml import etree
from pymongo import MongoClient
from selenium import webdriver

# Connection settings are placeholders ("IP", empty credentials) and must be
# filled in before running.
client = MongoClient("IP", 27017)
db = client["Automobile"]
collection = db["wenda_autohome"]
# NOTE(review): Database.authenticate() was removed in PyMongo 4; on newer
# drivers pass credentials to MongoClient instead.
db.authenticate("","")

driver = webdriver.Chrome(executable_path=r"D:\chromedriver_win32\chromedriver.exe")


def splist(l, s):
    """Split ``l`` into consecutive slices of length ``s``.

    The final slice is shorter when ``len(l)`` is not a multiple of ``s``.
    ``s`` is expected to be a positive integer.
    """
    # Step directly from chunk start to chunk start instead of visiting every
    # index and filtering with a modulo test.
    return [l[i:i + s] for i in range(0, len(l), s)]


try:
    for i in range(36726, 40202):
        # e.g. https://wenda.autohome.com.cn/topic/detail/40195
        url = 'https://wenda.autohome.com.cn/topic/detail/' + str(i)
        time.sleep(1)  # throttle requests between topics
        driver.get(url)
        tree = etree.HTML(driver.page_source)

        question = tree.xpath("//h1[@class='card-title']/text()")
        raw_answers = tree.xpath("//a[@class='text']/text()")
        # Skip ids that have no question title or no answers on the page.
        if not question or not raw_answers:
            continue

        answer_list = []
        # XPath positions are 1-based, so number the answers from 1.
        for position, raw_text in enumerate(raw_answers, 1):
            # NOTE(review): the fixed offsets presumably trim padding that the
            # page markup puts around each answer preview -- confirm; this
            # will silently mis-trim if the site layout changes.
            text = raw_text[41:-37]
            if text.endswith('...'):
                # Preview is truncated: follow the "more" link of this reply
                # and read the full answer from the page it opens.
                more_xpath = "//div[@class='card-reply-wrap'][" + str(position) + "]//a[@class='more']"
                try:
                    # NOTE(review): find_element_by_xpath was removed in
                    # Selenium 4 (use find_element(By.XPATH, ...) there).
                    driver.find_element_by_xpath(more_xpath).click()
                    answer_tree = etree.HTML(driver.page_source)
                    answer_parts = answer_tree.xpath("//div[@class='answer-content']/div/div[@class='ahe__area ahe__block ahe__text']/p/text()")
                    text = ''.join(answer_parts)
                    time.sleep(1)
                    # Return to the topic page so later "more" links resolve.
                    driver.get(url)
                except Exception as e:
                    # Best effort: report the problem and keep whatever text
                    # was obtained for this answer.
                    print(e)
            answer_list.append(text)

        # These fields come from the topic page as first loaded (``tree``).
        keywords = tree.xpath("//ul[@class='card-tag-list']/li/text()")
        description_parts = tree.xpath("//div[@class='ahe__area ahe__block ahe__text']/p/text()")
        discription = ''.join(description_parts)
        zancai = tree.xpath("//span[@class='js-praise-count']/text()")
        # Group the praise counters two at a time.
        # NOTE(review): presumably one (up, down) pair per answer -- confirm.
        zancai_list = splist(zancai, 2)

        dc = {
            'keywords': keywords,
            'question': question[0],
            'discription': discription,
            'answer': answer_list,
            'zancai': zancai_list,
            'url': url,
        }
        # NOTE(review): Collection.insert() is deprecated since PyMongo 3 and
        # removed in 4; switch to insert_one() once the driver version is
        # confirmed to be >= 3.0.
        collection.insert(dc)
finally:
    # Always end the browser session, even if the crawl aborts part-way;
    # quit() also terminates the chromedriver process, which close() does not.
    driver.quit()