#coding=utf-8
"""Crawl Wanfang journal search results for 2014 and archive detail pages.

The script reads the second document of the ``journal_name`` collection,
which carries two parallel lists: ``name_list`` (journal names) and
``number_list`` (result counts stored as strings). For every journal it
walks the paginated search results, extracts the detail-page links, downloads
each detail page and stores it in the ``journal_foreign_2014`` collection.
"""
import datetime

import requests
from lxml import etree
from pymongo import MongoClient

# Number of journal entries processed from the parallel lists.
# NOTE(review): hard-coded; presumably the length of name_list -- confirm.
JOURNAL_COUNT = 2645

# Results per search page; must agree with "pageSize=50" in the search URL.
PAGE_SIZE = 50

client = MongoClient("localhost", 27017)
db = client["wanfang"]
collection = db["journal_name"]
collection1 = db["journal_foreign_2014"]
db.authenticate("", "")

# The second document of the collection holds the journal lists.
cursor = collection.find()[1]

for journal_index in range(JOURNAL_COUNT):
    name = cursor['name_list'][journal_index]
    # The first and last characters of the stored count are dropped before
    # parsing (presumably surrounding brackets -- confirm against the data).
    num = int(cursor['number_list'][journal_index][1:-1])

    # Ceiling division with integers only, so the result is a valid range()
    # argument on both Python 2 and Python 3.
    count = (num + PAGE_SIZE - 1) // PAGE_SIZE

    for page in range(count):
        url = (
            "http://new.wanfangdata.com.cn/search/searchList.do?searchType=perio&pageSize=50&page="
            + str(page + 1)
            + u"&searchWord= 摘要:is 起始年:2014 結束年:2014 刊名:"
            + name
            + "&order=correlation&showType=detail&isCheck=check&isHit=&isHitUnit=&firstAuthor=false&rangeParame=all"
        )
        result = requests.post(url)
        html = result.text
        tree = etree.HTML(html)
        # href of the element immediately following <strong> inside each
        # <div class="title">.
        table = tree.xpath("//div[@class='title']/strong/following-sibling::*[1]/@href")

        for href in table:
            bson = {}
            url1 = "http://new.wanfangdata.com.cn" + href
            # Request the detail page itself so the stored HTML matches the
            # stored URL.
            result1 = requests.post(url1)
            html1 = result1.text
            fetched_at = datetime.datetime.now()
            bson['date'] = fetched_at
            bson['url'] = url1
            bson['html'] = html1
            bson['year'] = "2014"
            # NOTE(review): Collection.insert is deprecated in pymongo 3 and
            # removed in pymongo 4; consider insert_one when upgrading.
            collection1.insert(bson)