NSTL國家科技圖書文獻中心 2017 機械 儀表工業 所有期刊論文資訊
程式碼比較隨意,不要介意
第一步,爬取所有期刊連結
#coding=utf-8
"""Step 1: crawl all journal links from NSTL's TH (mechanics/instruments) category
and store them in the `journal_urls` MongoDB collection."""
import time
from selenium import webdriver
from lxml import etree
from pymongo import MongoClient

client = MongoClient("IP", 27017)
db = client["nstl"]
collection = db["journal_urls"]
db.authenticate("", "")

driver = webdriver.Chrome(executable_path=r"D:\chromedriver_win32\chromedriver.exe")
try:
    driver.get('https://www.nstl.gov.cn/facade/search/clcSearch.do?&lan=eng&clc=TH')
    html = driver.page_source
    tree = etree.HTML(html)
    # Total number of result pages (47 at the time of writing).
    count = int(tree.xpath("//span[@id='totalPages1']/text()")[0])
    for i in range(count):
        html = driver.page_source
        tree = etree.HTML(html)
        # Collect and store every journal link on the current page.
        urls = tree.xpath("//div[@class='s2listtd2']/span/a/@href")
        if urls:
            # insert_many replaces the deprecated Collection.insert.
            collection.insert_many([{'url': u} for u in urls])
        # Stop after the last page (i == count - 1).
        if i == count - 1:
            break
        # Click the pager link for the next page (pager is 1-indexed).
        driver.find_element_by_xpath(
            '//div[@id="page"]/div//a[text()="%s"]' % str(i + 2)).click()
        # Wait until the page content actually changes — bounded, so a
        # stuck page cannot hang the crawler forever (original looped
        # `while True` with no timeout).
        for _ in range(30):
            time.sleep(1)
            if driver.page_source != html:
                break
finally:
    # quit() tears down the whole session; the original close() only
    # closed the window and leaked the chromedriver process, and was
    # skipped entirely if any exception occurred above.
    driver.quit()
第二步,爬取每個期刊中所有2017年論文連結
#coding=utf-8
"""Step 2: for every journal harvested in step 1, crawl the links of all
papers published in 2017 and store them in `journalArticle2017_urls`."""
import time
from selenium import webdriver
from lxml import etree
from pymongo import MongoClient

client = MongoClient("IP", 27017)
db = client["nstl"]
collection1 = db["journal_urls"]
collection2 = db["journalArticle2017_urls"]
db.authenticate("", "")

driver = webdriver.Chrome(executable_path=r"D:\chromedriver_win32\chromedriver.exe")
try:
    # Iterate over every journal link stored by step 1.
    for item in collection1.find({}, {"url": 1, "_id": 0}):
        # url[29:-4] strips the javascript wrapper around the real URL
        # (slice bounds inherited from the original — verify against data).
        driver.get(item['url'][29:-4])
        html = driver.page_source
        tree = etree.HTML(html)
        # If 2018 papers are shown by default, expand the 2017 node first.
        if tree.xpath("//div[@id='year_2018']"):
            driver.find_element_by_xpath("//div[@id='year_2017']").click()
            time.sleep(1)
            driver.find_element_by_xpath(
                "//div[@id='volumeUl_2017']/div[@class='ltreebom2']").click()
        # Number of 2017 issues for this journal; loop over each issue.
        issues = tree.xpath("//div[@id='volumeUl_2017']//div[@class='ltreebom3']/a")
        for i in range(1, len(issues) + 1):
            wen_html = driver.page_source
            wen_tree = etree.HTML(wen_html)
            # BUG FIX: parse the *current* issue page (wen_tree), not the
            # stale journal landing page (tree) — the original reused
            # `tree`, so every issue stored the same first page of links.
            urls = wen_tree.xpath("//div[@class='s2listtd2']/a/@href")
            if urls:
                # insert_many replaces the deprecated Collection.insert.
                collection2.insert_many([{'url': u} for u in urls])
            if i == len(issues):
                break
            # Click the next issue in the volume tree; give up on this
            # journal if the element is missing or not clickable.
            try:
                driver.find_element_by_xpath(
                    "//div[@id='volumeUl_2017']//div[@class='ltreebom3'][%s]"
                    % str(i + 1)).click()
            except Exception:
                break
            # Wait (bounded) until the next issue's page has loaded —
            # the original `while True` could hang forever.
            for _ in range(30):
                time.sleep(1)
                if driver.page_source != wen_html:
                    break
finally:
    # Ensure the browser session is torn down even on errors.
    driver.quit()
第三步,爬取論文資訊詳情頁原始碼
#coding=utf-8
"""Step 3: fetch the raw HTML of every paper's full-view detail page and
store it in `journalArticle2017_codes`."""
import time
from pymongo import MongoClient
from selenium import webdriver

client = MongoClient("IP", 27017)
db = client["nstl"]
collection = db["journalArticle2017_urls"]
collection1 = db["journalArticle2017_codes"]
db.authenticate("", "")

driver = webdriver.Chrome(executable_path=r"D:\chromedriver_win32\chromedriver.exe")
try:
    # Build the full-view URL from each stored article link; the slices
    # extract the SEQNO and doc-type segments (inherited from the
    # original — verify against the stored data).
    for item in collection.find({}, {"url": 1, "_id": 0}):
        url = ("https://www.nstl.gov.cn/facade/search/toFullView.do?checkedSEQNO="
               + item['url'][23:-11] + "&subDocType=" + item['url'][-8:-3])
        # BUG FIX: the original compared driver.page_source against `html`,
        # but the requests code defining `html` was commented out, so the
        # loop raised NameError. Snapshot the previous page instead and
        # wait until the browser has navigated to the new document.
        previous = driver.page_source
        driver.get(url)
        for _ in range(100):
            time.sleep(1)
            if driver.page_source != previous:
                break
        # insert_one replaces the deprecated Collection.insert.
        collection1.insert_one({'html': driver.page_source})
finally:
    # Tear the session down even if an error occurs mid-crawl.
    driver.quit()
第四步,解析原始碼
#coding=utf-8
"""Step 4: parse the saved detail-page HTML into structured records and
store them in `journalArticle2017_data`."""
from pymongo import MongoClient
from lxml import etree

client = MongoClient("IP", 27017)
db = client["nstl"]
collection1 = db["journalArticle2017_codes"]
collection2 = db["journalArticle2017_data"]
db.authenticate("", "")

# Chinese field labels exactly as rendered on the detail page, mapped to
# the output field name. All of these share the same xpath pattern, so a
# table replaces twelve copy-pasted xpath lines.
FIELD_LABELS = [
    ('organization', u'【作者單位】:'),
    ('journal_name', u'【刊名】:'),
    ('issn', u'【ISSN】:'),
    ('publication_year', u'【出版年】:'),
    ('volume', u'【卷】:'),
    ('issue', u'【期】:'),
    ('page_start', u'【起頁】:'),
    ('page_end', u'【止頁】:'),
    ('page_count', u'【總頁數】:'),
    ('clc', u'【分類號】:'),
    ('language', u'【語種】:'),
    ('summary', u'【文摘】:'),
]
# Keywords live under an extra span/a level, so they need their own xpath.
KEYWORD_LABEL = u'【關鍵詞】:'

for item in collection1.find({}, {"html": 1, "_id": 0}):
    tree = etree.HTML(item["html"])
    dc = {}
    title = tree.xpath("//span[@name='title']/text()")
    # BUG FIX: the original did `dc['title'] = title[0]` unguarded, so any
    # page without a title span crashed the whole run with IndexError.
    if title:
        dc['title'] = title[0]
    author = tree.xpath("//a[starts-with(@href,'javascript:searchByAuthor')]/text()")
    if author:
        # Authors are stored as the full list (original behavior).
        dc['author'] = author
    for field, label in FIELD_LABELS:
        value = tree.xpath("//div[text()='%s']/following-sibling::*/text()" % label)
        if value:
            dc[field] = value[0]
    keywords = tree.xpath(
        "//div[text()='%s']/following-sibling::*/span/a/text()" % KEYWORD_LABEL)
    if keywords:
        # NOTE(review): only the first keyword is kept, matching the
        # original behavior — consider storing the whole list instead.
        dc['keywords'] = keywords[0]
    # insert_one replaces the deprecated Collection.insert.
    collection2.insert_one(dc)