Python 自用程式碼(知網會議論文網頁原始碼清洗)

右介發表於2017-07-17
#coding=utf-8
from pymongo import MongoClient
from lxml import etree
import requests

# Exact text() content of the <p> label nodes for the organization and author
# sections. standard_dict() interpolates these into XPath equality tests
# (//p[text()='...']), so the surrounding CRLF and space padding must be kept
# exactly as written here.
jigou = u"\r\n      【機構】\r\n      "
zuozhe = u"\r\n        【作者】\r\n          "

def get_db():
    """Return the authenticated ``cnki`` database on the local MongoDB server.

    Connects to ``localhost:27017`` and authenticates with the credentials
    hard-coded below (placeholders for a user name and a password).

    NOTE(review): ``Database.authenticate`` was removed in PyMongo 4.0; on a
    newer driver the credentials have to be passed to ``MongoClient`` instead.
    """
    database = MongoClient('localhost', 27017).cnki
    database.authenticate("使用者名稱", "密碼")
    return database

def get_data(table, num):
    """Return the ``html`` field of the ``num``-th document (1-based) in *table*.

    If the ``num``-th document has no ``html`` value (missing or empty), the
    search continues with the following documents and the first non-empty
    ``html`` found is returned; this matches the historical behaviour of this
    helper.

    Args:
        table: a collection-like object whose ``find(filter, projection)``
            returns an iterable of dicts.
        num: 1-based position of the wanted document.

    Returns:
        The ``html`` value, or ``None`` when ``num`` is never reached or no
        document from position ``num`` onwards carries a non-empty ``html``.
    """
    reached = False
    for position, item in enumerate(table.find({}, {"html": 1, "_id": 0}), 1):
        if not reached:
            if position != num:
                continue
            reached = True
        # dict.get works on both Python 2 and 3 (has_key is Python-2-only).
        html = item.get('html')
        if html:
            return html
    return None

def list_str(list):
    """Return the first element of *list*, or ``""`` when it is empty."""
    return list[0] if len(list) != 0 else ""

def en_ls(list, length1, length2):
    """Split the English author line into author names and organization names.

    Only the first element of *list* is used. The ``【Author】`` label and CRLF
    sequences are removed, the text is stripped and then split on ``;``.
    The split is accepted only when it yields exactly
    ``length1 + length2 + 1`` pieces; the last piece is discarded.

    Args:
        list: text nodes of the English author paragraph.
        length1: number of authors.
        length2: number of organizations.

    Returns:
        ``(authors, organizations)`` as ``;``-joined strings, or ``("", "")``
        when *list* is empty or the piece count does not match.
    """
    if len(list) == 0:
        return "", ""

    cleaned = list[0].replace(u"【Author】", "").replace("\r\n", "").strip()
    pieces = cleaned.split(";")
    expected = (length2 + length1) + 1
    if len(pieces) != expected:
        return "", ""

    authors = pieces[:length1]
    organizations = pieces[length1:-1]
    return list2str(authors), list2str(organizations)

def hyxx(list):
    """Extract the conference metadata fields from a list of text items.

    Each item is tested against the bracketed labels below, in order; the
    first label contained in the item selects the field. The stored value is
    the item with that label and all CRLF sequences removed, then stripped.
    When several items carry the same label, the last one wins.

    Args:
        list: text items (in this file: the ``<li>`` text nodes of the
            summary section).

    Returns:
        A 6-tuple of strings ``(proceedings title, conference name,
        conference date, conference place, classification number,
        organizer)``. Fields with no matching item are ``""``.
    """
    # NOTE(review): labels are matched literally; confirm they use the same
    # script (Simplified vs. Traditional Chinese) as the crawled pages.
    labels = (
        u"【會議錄名稱】",
        u"【會議名稱】",
        u"【會議時間】",
        u"【會議地點】",
        u"【分類號】",
        u"【主辦單位】",
    )
    # Every field defaults to "" so the result type does not depend on which
    # labels happen to be present (the classification number used to default
    # to a list).
    values = [""] * len(labels)
    for item in list:
        for index, label in enumerate(labels):
            if label in item:
                # The organizer branch used to call .replace(u"", ";"), whose
                # empty pattern inserts ";" between every character. The
                # intended separator character is not recoverable from the
                # source, so the value is stored unmodified.
                values[index] = item.replace(label, "").replace("\r\n", "").strip()
                break
    return tuple(values)

def list2str(list):
    """Join the strings in *list* with ``;``; an empty list gives ``""``."""
    return ";".join(list) if len(list) != 0 else ""

def standard_dict(html):
    """Build the cleaned record (a dict) for one paper from its page HTML.

    Args:
        html: HTML source of a conference-paper page.

    Returns:
        A dict with the keys ``title``, ``title_eng``, ``author``,
        ``organization``, ``author_eng``, ``organization_eng``, ``summary``,
        ``summary_eng``, ``keywords``, ``keywords_eng``, ``proceeding_title``,
        ``conference_title``, ``conference_date``, ``conference_place``,
        ``huiyflh`` and ``conference_org``. Multi-valued fields are joined
        with ``;``; fields not found on the page are ``""``.
    """
    dc = {}
    # Progress marker, one per processed record. Written as a call so the
    # line is valid on both Python 2 and Python 3 with identical output.
    print(1)
    tree = etree.HTML(html)

    # Chinese and foreign-language titles.
    dc["title"] = list_str(tree.xpath("//span[@id='chTitle']/text()"))
    dc["title_eng"] = list_str(tree.xpath("//span[@id='enTitle']/text()"))

    # Authors and organizations: each query is evaluated once and reused for
    # both the joined string and the count needed by en_ls().
    authors = tree.xpath("//p[text()='%s']/a/text()" % zuozhe)
    organizations = tree.xpath("//p[text()='%s']/a/text()" % jigou)
    dc["author"] = list2str(authors)
    dc["organization"] = list2str(organizations)

    # English author and organization names share one paragraph; the counts
    # tell en_ls() where to split it.
    dc["author_eng"], dc["organization_eng"] = en_ls(
        tree.xpath("//p[@id='au_en']/text()"), len(authors), len(organizations))

    # Abstracts.
    dc["summary"] = list_str(tree.xpath("//span[@id='ChDivSummary']/text()"))
    dc["summary_eng"] = list_str(tree.xpath("//span[@id='EnChDivSummary']/text()"))

    # Keywords: first <span> is read as Chinese, second as English.
    dc["keywords"] = list2str(tree.xpath("//div[@class='keywords']/span[1]/a/text()"))
    dc["keywords_eng"] = list2str(tree.xpath("//div[@class='keywords']/span[2]/a/text()"))

    # Conference information.
    (dc["proceeding_title"],
     dc["conference_title"],
     dc["conference_date"],
     dc["conference_place"],
     dc["huiyflh"],
     dc["conference_org"]) = hyxx(tree.xpath("//div[@class='summary']/ul/li/text()"))
    if dc["proceeding_title"] == "":
        # Fallback: take the proceedings title from the link inside the
        # first <ul> when no plain-text item provided it.
        print(2)
        dc["proceeding_title"] = list_str(tree.xpath("//div[@class='summary']/ul[1]/li/a/text()"))

    return dc

def main():
    """Clean every stored page and write the result to ``conference_cleaned``.

    Reads the ``html`` field of each document in the ``conference``
    collection, skips documents whose ``html`` is missing or empty, converts
    the rest with ``standard_dict`` and inserts each resulting dict into the
    ``conference_cleaned`` collection.
    """
    db = get_db()
    source = db.conference
    target = db.conference_cleaned
    for item in source.find({}, {"html": 1, "_id": 0}):
        # dict.get works on both Python 2 and 3 (has_key is Python-2-only).
        html = item.get('html')
        if html:
            # NOTE(review): Collection.insert is deprecated since PyMongo 3.0
            # and removed in 4.0; switch to insert_one when upgrading.
            target.insert(standard_dict(html))


if __name__ == '__main__':
    main()
    # The commented-out code below cleans one specific record, for testing.
    # db = get_db()
    # collection=db.conference
    # data = get_data(collection, 1)
    # dc = standard_dict(data)
    # for k,v in dc.items():
    #     print k,v

 

相關文章