2016年
# coding=utf-8
# 2016 data: expand each project's u'主要完成人' list (one text line per
# person) into one flat document per person.
# Reads collection "nosta_2016" and writes to "2016_list" in database "nosta".
import re

# Every pattern captures "<label><value>" (the label stays part of the stored
# value).  A space-delimited value stops at the first space or at the end of
# the line, so a field that happens to be the last token is no longer dropped.
# Except for the name, a label must sit at the start of the line or right
# after a space; the greedy ".* " prefix prefers the last such occurrence.
# NOTE(review): the labels must match the stored text character for
# character (script variant and colon included) -- confirm against the data.
NAME_RE = re.compile(u'(姓名:.*?)(?: |$)')
DUTY_RE = re.compile(u'(?:.* |^)(行政職務:.*?)(?: |$)')
TITLE_RE = re.compile(u'(?:.* |^)(技術職稱:.*?)(?: |$)')
UNIT_RE = re.compile(u'(?:.* |^)(工作單位:.*?)(?: |$)')
TECH_CONTRIBUTION_RE = re.compile(u'(?:.* |^)(對本專案技術創造性貢獻:.*?)(?: |$)')
ACADEMIC_CONTRIBUTION_RE = re.compile(u'(?:.* |^)(對本專案主要學術貢獻:.*?)(?: |$)')
# The award field is not space-delimited: it runs to the end of the line.
AWARD_RE = re.compile(u'(?:.* |^)(曾獲國家科技獎勵情況:.*)')


def _capture(pattern, line):
    """Return group 1 of the first match of *pattern* in *line*, or ''."""
    found = pattern.search(line)
    return found.group(1) if found else ''


def parse_member(line, project_name):
    """Build the per-person document for one line of the member list.

    line         -- text describing one person, fields separated by spaces.
    project_name -- copied unchanged into the result.

    Returns a dict with the keys project_name, name, duty, title, unit,
    contribution and award.  A field that is absent from *line* is ''.
    """
    return {
        'project_name': project_name,
        'name': _capture(NAME_RE, line),
        'duty': _capture(DUTY_RE, line),
        'title': _capture(TITLE_RE, line),
        'unit': _capture(UNIT_RE, line),
        # The academic-contribution label is only a fallback for lines that
        # lack the technical-contribution one.
        'contribution': (_capture(TECH_CONTRIBUTION_RE, line)
                         or _capture(ACADEMIC_CONTRIBUTION_RE, line)),
        'award': _capture(AWARD_RE, line),
    }


def main():
    """Copy every person of every 2016 project into the "2016_list" collection."""
    # Imported here so that the parsing helpers above can be imported (and
    # tested) on a machine without the MongoDB driver.
    from pymongo import MongoClient

    client = MongoClient("localhost", 27017)
    db = client["nosta"]
    source = db["nosta_2016"]
    target = db["2016_list"]
    # NOTE(review): credentials are hard-coded; move them to configuration.
    # Database.authenticate and Collection.insert were removed in PyMongo 4;
    # pass username/password to MongoClient and use insert_one when upgrading.
    db.authenticate("zty", "zty")

    wanted = {"project_name": 1, "project_content": 1, "_id": 0}
    for count, item in enumerate(source.find({}, wanted), 1):
        print(count)  # progress indicator
        members = item['project_content'].get(u'主要完成人')
        if not members:
            continue
        for line in members:
            target.insert(parse_member(line, item['project_name']))


if __name__ == "__main__":
    main()
2017、2018年
# coding=utf-8
# 2017/2018 data: the u'主要完成人:' value is one run-together string that
# holds every person of a project.  Split it into one line per person and
# store one flat document per person.
# Reads collection "nosta_<year>" and writes to "<year>_list" in database
# "nosta"; the year defaults to 2017.
import re

# Labels that get a space put in front of them so that each field becomes a
# space-delimited token.
FIELD_LABELS = (
    u'排名:',
    u'行政職務:',
    u'技術職稱:',
    u'工作單位:',
    u'完成專案時所在單位:',
    u'對本專案技術創造性貢獻:',
    u'對本專案主要學術貢獻:',
    u'曾獲國家科技獎勵情況:',
)

# Every pattern captures "<label><value>" (the label stays part of the stored
# value).  A space-delimited value stops at the first space or at the end of
# the line, so a field that happens to be the last token is no longer dropped.
# Except for the name, a label must sit at the start of the line or right
# after a space; the greedy ".* " prefix prefers the last such occurrence.
# NOTE(review): the labels must match the stored text character for
# character (script variant and colon included) -- confirm against the data.
NAME_RE = re.compile(u'(姓名:.*?)(?: |$)')
DUTY_RE = re.compile(u'(?:.* |^)(行政職務:.*?)(?: |$)')
TITLE_RE = re.compile(u'(?:.* |^)(技術職稱:.*?)(?: |$)')
UNIT_RE = re.compile(u'(?:.* |^)(工作單位:.*?)(?: |$)')
TECH_CONTRIBUTION_RE = re.compile(u'(?:.* |^)(對本專案技術創造性貢獻:.*?)(?: |$)')
ACADEMIC_CONTRIBUTION_RE = re.compile(u'(?:.* |^)(對本專案主要學術貢獻:.*?)(?: |$)')
# The award field is not space-delimited: it runs to the end of the line.
AWARD_RE = re.compile(u'(?:.* |^)(曾獲國家科技獎勵情況:.*)')


def _capture(pattern, line):
    """Return group 1 of the first match of *pattern* in *line*, or ''."""
    found = pattern.search(line)
    return found.group(1) if found else ''


def split_members(text):
    """Split the run-together member text into one normalised line per person.

    Whatever precedes the first u'姓名:' is discarded.  Each returned line
    starts with u'姓名:' and has a space in front of every known field label.
    """
    lines = []
    for chunk in text.split(u'姓名:')[1:]:
        for label in FIELD_LABELS:
            chunk = chunk.replace(label, u' ' + label)
        lines.append(u'姓名:' + chunk)
    return lines


def parse_member(line, project_name):
    """Build the per-person document for one normalised member line.

    line         -- one element of the list returned by split_members().
    project_name -- copied unchanged into the result.

    Returns a dict with the keys project_name, name, duty, title, unit,
    contribution and award.  A field that is absent from *line* is ''.
    """
    return {
        'project_name': project_name,
        'name': _capture(NAME_RE, line),
        'duty': _capture(DUTY_RE, line),
        'title': _capture(TITLE_RE, line),
        'unit': _capture(UNIT_RE, line),
        # The academic-contribution label is only a fallback for lines that
        # lack the technical-contribution one.
        'contribution': (_capture(TECH_CONTRIBUTION_RE, line)
                         or _capture(ACADEMIC_CONTRIBUTION_RE, line)),
        'award': _capture(AWARD_RE, line),
    }


def main(year="2017"):
    """Copy every person of every project of *year* into "<year>_list".

    year -- text used to build the collection names "nosta_<year>" (source)
            and "<year>_list" (target).
            NOTE(review): other years are assumed to follow the same naming
            scheme -- confirm before running.
    """
    # Imported here so that the parsing helpers above can be imported (and
    # tested) on a machine without the MongoDB driver.
    from pymongo import MongoClient

    client = MongoClient("localhost", 27017)
    db = client["nosta"]
    source = db["nosta_" + year]
    target = db[year + "_list"]
    # NOTE(review): credentials are hard-coded; move them to configuration.
    # Database.authenticate and Collection.insert were removed in PyMongo 4;
    # pass username/password to MongoClient and use insert_one when upgrading.
    db.authenticate("zty", "zty")

    wanted = {"project_name": 1, "project_content": 1, "_id": 0}
    for count, item in enumerate(source.find({}, wanted), 1):
        print(count)  # progress indicator
        text = item['project_content'].get(u'主要完成人:')
        # Skip projects whose member field is missing or empty ([] or '').
        if not text:
            continue
        for line in split_members(text):
            target.insert(parse_member(line, item['project_name']))


if __name__ == "__main__":
    import sys

    # Optional first command-line argument selects the year, e.g. "2018".
    main(sys.argv[1] if len(sys.argv) > 1 else "2017")