Python 爬取汽車領域問答語料(自用)

右介發表於2018-08-06
#coding=utf-8

import time
import requests
from lxml import etree
from pymongo import MongoClient
from selenium import webdriver

client = MongoClient("IP", 27017)
db = client["Automobile"]
collection = db["wenda_autohome"]
db.authenticate("","")

driver = webdriver.Chrome(executable_path=r"D:\chromedriver_win32\chromedriver.exe")

def splist(l, s):
    """Split sequence ``l`` into consecutive chunks of length ``s``.

    The final chunk may be shorter when ``len(l)`` is not a multiple of ``s``.
    Returns a list of slices of ``l``.
    """
    chunks = []
    for start in range(len(l)):
        # Only indices that are multiples of the chunk size begin a chunk.
        if start % s:
            continue
        chunks.append(l[start:start + s])
    return chunks

# Crawl every question page in the hard-coded topic-id range and store one
# MongoDB document per page that has both a question title and answers.
# The try/finally guarantees the browser is shut down even if the crawl aborts.
try:
    for i in range(36726, 40202):
        # Example page: https://wenda.autohome.com.cn/topic/detail/40195
        url = 'https://wenda.autohome.com.cn/topic/detail/' + str(i)

        time.sleep(1)  # pause between page loads
        driver.get(url)
        html = driver.page_source
        tree = etree.HTML(html)

        question = tree.xpath("//h1[@class='card-title']/text()")
        answer_list = tree.xpath("//a[@class='text']/text()")
        # Skip ids that have no question title or no answers on the page.
        if not question or not answer_list:
            continue

        # n is 1-based because it is also used as an XPath position index.
        for n, raw_answer in enumerate(answer_list, 1):
            # Drop a fixed-width prefix/suffix from the preview text.
            # NOTE(review): presumably whitespace padding from the page
            # markup -- confirm the 41/-37 offsets against the live HTML.
            answer_list[n - 1] = raw_answer[41:-37]
            # Previews not ending in '...' are kept as they are.
            if not answer_list[n - 1].endswith('...'):
                continue

            # A preview ending in '...' has a "more" link in the n-th reply
            # card; follow it and read the answer from the resulting page.
            s = "//div[@class='card-reply-wrap'][" + str(n) + "]//a[@class='more']"
            try:
                # NOTE(review): find_element_by_xpath was removed in
                # Selenium 4.3; use find_element(By.XPATH, s) when upgrading.
                driver.find_element_by_xpath(s).click()

                html_answer = driver.page_source
                tree_answer = etree.HTML(html_answer)
                answer_part = tree_answer.xpath("//div[@class='answer-content']/div/div[@class='ahe__area ahe__block ahe__text']/p/text()")
                answer_list[n - 1] = ''.join(answer_part)

                # Go back to the question page for the next "more" link.
                time.sleep(1)
                driver.get(url)
            except Exception as e:
                # Best effort: keep the preview text and carry on.
                print(e)
                continue

        keywords = tree.xpath("//ul[@class='card-tag-list']/li/text()")
        discription_list = tree.xpath("//div[@class='ahe__area ahe__block ahe__text']/p/text()")
        discription = ''.join(discription_list)

        # Group the praise-count texts in pairs.
        # NOTE(review): presumably one (up, down) pair per answer -- confirm.
        zancai = tree.xpath("//span[@class='js-praise-count']/text()")
        zancai_list = splist(zancai, 2)

        dc = {}
        dc['keywords'] = keywords
        dc['question'] = question[0]
        dc['discription'] = discription
        dc['answer'] = answer_list
        dc['zancai'] = zancai_list
        dc['url'] = url
        # insert_one replaces the deprecated Collection.insert.
        collection.insert_one(dc)
finally:
    # quit() ends the whole WebDriver session (browser and driver process),
    # whereas close() only closes the current window.
    driver.quit()

 

相關文章