#coding=utf-8
"""Crawl fund data from the Science Fund shared service site into MongoDB.

Every result page of the funding-project search at npd.nsfc.gov.cn is
requested in turn, each ``<dl class="time_dl">`` entry found on the page is
parsed into a flat dict by :func:`jiexi`, and that dict is inserted into the
``science_fund`` collection of the ``ScienceFund`` database.
"""
import json

try:
    from html import unescape as _unescape  # Python 3
except ImportError:
    from HTMLParser import HTMLParser  # Python 2

    _unescape = HTMLParser().unescape

# Form fields posted with every search request; only 'currentPage' varies.
# NOTE(review): 'checkCode' is already percent-encoded and requests encodes
# form values itself, so it is probably sent double-encoded -- confirm that
# the server accepts it before changing the literal.
data = {
    'pageSize': 10,
    'currentPage': 1,
    'fundingProject.projectNo': '',
    'fundingProject.name': '',
    'fundingProject.person': '',
    'fundingProject.org': '',
    'fundingProject.applyCode': '',
    'fundingProject.grantCode': '',
    'fundingProject.subGrantCode': '',
    'fundingProject.helpGrantCode': '',
    'fundingProject.keyword': '',
    'fundingProject.statYear': '',
    'checkCode': '%E8%AF%B7%E8%BE%93%E5%85%A5%E9%AA%8C%E8%AF%81%E7%A0%81',
}

url = 'http://npd.nsfc.gov.cn/fundingProjectSearchAction!search.action'

# Request headers sent with every search request.
# NOTE(review): 'Content-Length' and the 'Cookie' session id are fixed
# values; requests normally derives Content-Length from the encoded body and
# the session id may have expired -- confirm before relying on either.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Content-Length': '340',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Cookie': 'JSESSIONID=8BD27CE37366ED8022B42BFC68FF82D4',
    'Host': 'npd.nsfc.gov.cn',
    'Origin': 'http://npd.nsfc.gov.cn',
    'Referer': 'http://npd.nsfc.gov.cn/fundingProjectSearchAction!search.action',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
}

# Inclusive range of result pages fetched by default.
FIRST_PAGE = 1
LAST_PAGE = 43183

# Seconds to wait for the server before giving up on a request; without a
# timeout a single stalled connection would block the crawl indefinitely.
REQUEST_TIMEOUT = 30

# 20 == len('<dl class="time_dl">'): the title search starts at this offset so
# that it skips the '">' closing the entry's own opening tag (presumably the
# first tag of every serialized entry -- see the XPath used in main()).
_ENTRY_OPEN_TAG_LEN = 20

# (result key, label text) pairs in the order the labels appear in an entry.
# NOTE(review): the percent-encoded 'checkCode' above decodes to Simplified
# Chinese while these labels are Traditional Chinese -- confirm that they
# match the markup the site actually serves.
FIELD_LABELS = (
    ('standard_no', u'批准號'),                # approval number
    ('standard_type', u'專案類別'),            # project category
    ('supporting_institution', u'依託單位'),   # supporting institution
    ('project_principal', u'專案負責人'),      # project principal
    ('funds', u'資助經費'),                    # funding amount
    ('year', u'批准年度'),                     # approval year
    ('keywords', u'關鍵詞'),                   # keywords
)


def main(first_page=FIRST_PAGE, last_page=LAST_PAGE):
    """Fetch result pages and store every parsed entry in MongoDB.

    Args:
        first_page: First result page to request (inclusive).
        last_page: Last result page to request (inclusive).

    Both arguments default to the full range, so ``main()`` crawls everything;
    passing a later ``first_page`` resumes an interrupted crawl.
    """
    # Third-party imports are local so that the pure parser (jiexi) can be
    # imported without the crawler's dependencies being installed.
    import requests
    from lxml import etree
    from pymongo import MongoClient

    client = MongoClient('localhost', 27017)
    try:
        db = client.ScienceFund
        # NOTE(review): credentials are blank in this file, and both
        # Database.authenticate and Collection.insert are legacy PyMongo
        # APIs -- confirm the installed PyMongo version still provides them.
        db.authenticate("", "")
        collection = db.science_fund
        for page in range(first_page, last_page + 1):
            print(page)
            # Build a per-request copy instead of mutating the shared dict.
            payload = dict(data, currentPage=page)
            result = requests.post(url, data=payload, headers=headers,
                                   timeout=REQUEST_TIMEOUT)
            tree = etree.HTML(result.text)
            if tree is None:
                # Nothing parsable came back for this page; skip it.
                continue
            for item in tree.xpath("//dl[@class='time_dl']"):
                # tostring() yields ASCII bytes with non-ASCII text written as
                # character references; decode to text, then unescape to get
                # the real characters back.
                markup = etree.tostring(item, method='html').decode('ascii')
                collection.insert(jiexi(_unescape(markup)))
    finally:
        client.close()


def _extract_field(content, label, start):
    """Return ``(value, next_start)`` for *label* searched from *start*.

    The value is the stripped text between the label (plus the single
    separator character that follows it) and the next ``</dd>``.  When either
    the label or the closing tag is missing, the value is an empty string and
    *start* is returned unchanged so later fields can still be located.
    """
    label_pos = content.find(label, start)
    if label_pos == -1:
        return u'', start
    end = content.find('</dd>', label_pos)
    if end == -1:
        return u'', start
    return content[label_pos + len(label) + 1:end].strip(), end


def jiexi(content):
    """Parse one serialized result entry into a dict of its fields.

    Args:
        content: Text of one entry (markup with entities already unescaped).

    Returns:
        A dict with the keys ``title``, ``standard_no``, ``standard_type``,
        ``supporting_institution``, ``project_principal``, ``funds``, ``year``
        and ``keywords``.  A field whose markers cannot be found is returned
        as an empty string instead of a mis-sliced fragment of the markup.
    """
    # The title sits between the first '">' after the entry's opening tag and
    # the first closing tag in the entry.
    title_start = content.find('">', _ENTRY_OPEN_TAG_LEN)
    title_end = content.find('</')
    if title_start == -1 or title_end == -1:
        title = u''
    else:
        title = content[title_start + 2:title_end]

    record = {'title': title}
    # Each label is searched for after the previous field's end, so a label
    # that also occurs earlier (for example inside the title) is not matched.
    cursor = max(title_end, 0)
    for key, label in FIELD_LABELS:
        record[key], cursor = _extract_field(content, label, cursor)
    return record


if __name__ == '__main__':
    main()