#coding=utf-8
"""Clean raw conference-paper HTML stored in MongoDB into structured records.

Documents are read from the ``conference`` collection of the ``cnki``
database, bibliographic fields are extracted from each document's ``html``
value with XPath, and the resulting dictionaries are inserted into the
``conference_cleaned`` collection.

The code is written to run unchanged on Python 2 and Python 3.
"""
from pymongo import MongoClient
from lxml import etree
import requests  # not referenced by the code below; kept as in the original

# Exact text of the <p> nodes that introduce the institution / author links.
# The XPath queries in standard_dict compare against these strings verbatim,
# so the whitespace inside them is significant.
jigou = u"\r\n 【機構】\r\n "
zuozhe = u"\r\n 【作者】\r\n "


def get_db():
    """Connect to the local MongoDB server and return the ``cnki`` database.

    The credentials passed to ``authenticate`` are placeholder strings and
    must be replaced with real ones before running.
    """
    client = MongoClient('localhost', 27017)
    db = client.cnki
    # NOTE(review): Database.authenticate only exists in older PyMongo
    # releases - confirm against the installed version.
    db.authenticate("使用者名稱", "密碼")
    return db


def get_data(table, num):
    """Return the ``html`` value of the ``num``-th document (1-based).

    Documents are visited in the order ``table.find`` yields them.  If the
    document at position ``num`` has no non-empty ``html`` value, the counter
    is not advanced, so the first later document that has one is returned
    instead.  Returns ``None`` when no such document exists.
    """
    position = 1
    for item in table.find({}, {"html": 1, "_id": 0}):
        if position != num:
            position += 1
            continue
        # dict.has_key was removed in Python 3; .get covers both the
        # "key missing" and the "value empty" case.
        if item.get('html'):
            return item['html']
    return None


def list_str(list):
    """Return the first element of ``list``, or ``""`` when it is empty.

    The parameter name shadows the builtin; it is kept so that existing
    keyword callers continue to work.
    """
    return list[0] if list else ""


def en_ls(list, length1, length2):
    """Split the English author line into author names and institution names.

    ``list`` is the XPath text result for the English author paragraph; only
    its first element is used.  After removing the ``【Author】`` label and
    line breaks, the text is split on ``;``.  The split is accepted only when
    it has exactly ``length1 + length2 + 1`` parts: the first ``length1`` are
    taken as authors, the next ``length2`` as institutions, and the last part
    is discarded (presumably the empty remainder after a trailing ``;`` -
    verify against real pages).

    Returns a ``(authors, institutions)`` pair of ``;``-joined strings, or
    ``("", "")`` when ``list`` is empty or the part count does not match.
    """
    if not list:
        return "", ""
    parts = (
        list[0]
        .replace(u"【Author】", "")
        .replace("\r\n", "")
        .strip()
        .split(";")
    )
    if len(parts) != length1 + length2 + 1:
        return "", ""
    return list2str(parts[:length1]), list2str(parts[length1:-1])


def _strip_label(text, label):
    """Remove ``label`` and all CRLF sequences from ``text``, then trim it."""
    return text.replace(label, "").replace("\r\n", "").strip()


def hyxx(list):
    """Extract conference information from a list of labelled text items.

    Each item is matched against the labels below in order, and only the
    first matching label is applied to an item.  When several items carry
    the same label, the last one wins.

    Returns a 6-tuple ``(proceedings name, conference name, conference date,
    conference place, classification number, organiser)``.  An empty ``list``
    yields six empty strings.
    """
    if not list:
        return "", "", "", "", "", ""

    # NOTE(review): the classification number defaults to [] while every
    # other field (and the empty-input path) uses "".  Preserved as-is
    # because downstream consumers are not visible here.
    hylmc, hymc, hysj, hydd, flh, zbdw = "", "", "", "", [], ""
    for item in list:
        if u"【會議錄名稱】" in item:
            hylmc = _strip_label(item, u"【會議錄名稱】")
        elif u"【會議名稱】" in item:
            hymc = _strip_label(item, u"【會議名稱】")
        elif u"【會議時間】" in item:
            hysj = _strip_label(item, u"【會議時間】")
        elif u"【會議地點】" in item:
            hydd = _strip_label(item, u"【會議地點】")
        elif u"【分類號】" in item:
            flh = _strip_label(item, u"【分類號】")
        elif u"【主辦單位】" in item:
            # Organisers are separated by the ideographic comma; normalise
            # it to the ";" separator used for the other multi-valued fields.
            zbdw = (
                item.replace(u"【主辦單位】", "")
                .replace(u"、", ";")
                .replace("\r\n", "")
                .strip()
            )
    return hylmc, hymc, hysj, hydd, flh, zbdw


def list2str(list):
    """Join the elements of ``list`` with ``;`` (``""`` for an empty list)."""
    return ";".join(list)


def _link_texts(tree, label):
    """Return the link texts inside every <p> whose text equals ``label``."""
    return tree.xpath("//p[text()='%s']/a/text()" % label)


def standard_dict(html):
    """Build the dictionary stored for one paper from its raw ``html``.

    The returned dictionary has the keys ``title``, ``title_eng``,
    ``author``, ``organization``, ``author_eng``, ``organization_eng``,
    ``summary``, ``summary_eng``, ``keywords``, ``keywords_eng``,
    ``proceeding_title``, ``conference_title``, ``conference_date``,
    ``conference_place``, ``huiyflh`` and ``conference_org``.
    """
    dc = {}
    print(1)  # progress marker: one line per processed document
    tree = etree.HTML(html)

    # Paper title and foreign-language title.
    dc["title"] = list_str(tree.xpath("//span[@id='chTitle']/text()"))
    dc["title_eng"] = list_str(tree.xpath("//span[@id='enTitle']/text()"))

    # Authors and institutions.  Each query is evaluated once and reused for
    # both the joined string and the count needed by en_ls.
    authors = _link_texts(tree, zuozhe)
    dc["author"] = list2str(authors)
    organizations = _link_texts(tree, jigou)
    dc["organization"] = list2str(organizations)

    # English author names and English institution names.
    dc["author_eng"], dc["organization_eng"] = en_ls(
        tree.xpath("//p[@id='au_en']/text()"),
        len(authors),
        len(organizations),
    )

    # Abstract and English abstract.
    dc["summary"] = list_str(tree.xpath("//span[@id='ChDivSummary']/text()"))
    dc["summary_eng"] = list_str(
        tree.xpath("//span[@id='EnChDivSummary']/text()")
    )

    # Keywords and English keywords.
    dc["keywords"] = list2str(
        tree.xpath("//div[@class='keywords']/span[1]/a/text()")
    )
    dc["keywords_eng"] = list2str(
        tree.xpath("//div[@class='keywords']/span[2]/a/text()")
    )

    # Conference information.
    (
        dc["proceeding_title"],
        dc["conference_title"],
        dc["conference_date"],
        dc["conference_place"],
        dc["huiyflh"],
        dc["conference_org"],
    ) = hyxx(tree.xpath("//div[@class='summary']/ul/li/text()"))

    if dc["proceeding_title"] == "":
        print(2)  # progress marker: fallback lookup used
        # Fall back to the link text inside the first list of the summary.
        dc["proceeding_title"] = list_str(
            tree.xpath("//div[@class='summary']/ul[1]/li/a/text()")
        )
    return dc


def main():
    """Clean every document of ``conference`` into ``conference_cleaned``.

    Documents without a non-empty ``html`` value are skipped.
    """
    db = get_db()
    collection = db.conference
    collection2 = db.conference_cleaned
    for item in collection.find({}, {"html": 1, "_id": 0}):
        if item.get('html'):
            dc = standard_dict(item['html'])
            # NOTE(review): Collection.insert is the legacy PyMongo API;
            # insert_one is its replacement where the installed version
            # provides it - confirm before switching.
            collection2.insert(dc)


if __name__ == '__main__':
    main()

# The following snippet can be used to test cleaning one specific document:
# db = get_db()
# collection = db.conference
# data = get_data(collection, 1)
# dc = standard_dict(data)
# for k, v in dc.items():
#     print(k, v)