爬蟲專案總結
爬蟲專案使用手冊
專案1 爬取ChemicalBook
- 爬取化合物列表
  - 爬取程式碼:chemical.py
  - 輸出檔案: data.xls
- 爬取化合物具體資訊
  - 爬取程式碼:pagedata.py
  - 輸出檔案: pagedata.txt
1.1 爬取CAS號、中文名、英文名、分子式程式碼
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 21 09:49:56 2020
@author: JX
"""
import requests
from bs4 import BeautifulSoup
import re
import xlwt
# Browser-like request headers for chemicalbook.com.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
}

# Listing pages to crawl; range(0, 101, 100) yields the offsets 0 and 100.
_LIST_URL = 'https://www.chemicalbook.com/CASDetailList_{}.htm'
base_url = list(map(_LIST_URL.format, range(0, 101, 100)))
print(base_url)

# Per-row extractors, applied to the HTML of a single <tr>.
# NOTE(review): finds2 and finds4 use a greedy ".*" before the closing quote;
# confirm they cannot over-match on rows that contain several links/spans.
finds1 = re.compile(r'<a class="blue" href="/CAS.*?">(.*?)</a>', re.S)
finds2 = re.compile(r'<a class="blue" href="/ChemicalProductProperty_CN_.*">(.*?)</a>', re.S)
finds3 = re.compile(r'<td width="380">(.*?)</td>', re.S)
finds4 = re.compile(r'<span id="ContentPlaceHolder1_ProductClassDetail_.*">(.*?)</span>', re.S)
def getData():
    """Scrape every listing page in ``base_url`` and return the table rows.

    Returns:
        A list with one entry per ``<tr>`` element found. Each entry is a list
        holding, in order, the first match of ``finds1`` .. ``finds4`` (CAS
        number, Chinese name, English name, molecular formula). A field whose
        pattern did not match is simply absent, so an entry may have fewer
        than four items or be empty.
    """
    row_patterns = (finds1, finds2, finds3, finds4)
    datalist = []
    for url in base_url:
        print('第{}頁'.format(url))
        # Send the module-level browser headers; they were defined for this
        # request but previously never passed along.
        page = requests.get(url, headers=headers)
        soup = BeautifulSoup(page.content, 'html.parser')
        for tr in soup.find_all('tr'):
            # Flatten Windows line breaks so a cell's text stays on one line.
            row_html = str(tr).replace('\r\n', ' ')
            data = []
            for pattern in row_patterns:
                hits = pattern.findall(row_html)
                if hits:
                    data.append(hits[0])
            datalist.append(data)
    return datalist
def saveData(datalist, savepath):
    """Write the scraped rows to an .xls workbook at *savepath*.

    Args:
        datalist: rows as produced by ``getData`` (lists of up to four strings;
            empty lists are allowed).
        savepath: destination path of the workbook.

    Row ``i`` of *datalist* is written to sheet row ``i + 1`` (row 0 holds the
    column titles); an empty entry leaves its sheet row blank.
    """
    print("save......")
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet('IPA', cell_overwrite_ok=True)
    col = ("CAS", "中文名", "英文名", "MF")
    for i, title in enumerate(col):
        sheet.write(0, i, title)
    # Iterate over what was actually scraped instead of a fixed 32652 rows,
    # which raised IndexError whenever fewer rows were available.
    for i, data in enumerate(datalist):
        # A row can carry fewer than four fields when a pattern did not match;
        # write the fields that exist rather than indexing past the end.
        for j, value in enumerate(data[:len(col)]):
            sheet.write(i + 1, j, value)
    book.save(savepath)
if __name__ == "__main__":
    datalist = getData()
    print(datalist)
    savepath = ".\\data1.xls"
    # Drop the first scraped row (presumably the table header row; confirm).
    # Guarded so that an empty scrape no longer raises IndexError.
    if datalist:
        del datalist[0]
    # Saving is disabled here; uncomment to write the workbook.
    #saveData(datalist,savepath)
    print("爬取完畢")
- 爬取單頁資料
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 22 09:20:31 2020
@author: JX
"""
import requests
import re
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
url = "https://www.chemicalbook.com/CAS_5446-18-4.htm"


def save_contents(urlist):
    """Append every item of *urlist* to ./data.txt without separators.

    The caller below passes a single string, in which case the items are its
    characters and the string is written unchanged.
    """
    with open("./data.txt", 'a+', encoding='utf-8') as f:
        f.writelines(urlist)


# Send the browser headers defined above; they were previously built but unused.
page = requests.get(url, headers=headers)
print(page.status_code)
soup = BeautifulSoup(page.content, 'html.parser')
for section in soup.find_all('div', id="ContentPlaceHolder1_SubClass"):
    for text in section.stripped_strings:
        save_contents(text)

# Re-read the dump, split each line on the opening bracket "【" and turn the
# closing bracket "】" into ":" in the printed representation.
with open('data.txt', 'r', encoding='utf-8') as f:
    dic = [re.split('【', line) for line in f]
dic = re.sub('】', ":", str(dic))
print(dic)
專案2 爬取IPA資料庫,單頁資料的獲取程式碼
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 23 16:31:25 2020
@author: JX
"""
from bs4 import BeautifulSoup
import re
import urllib.request,urllib.error
import xlwt
import unicodedata
def _td_regex(attrs):
    """Compile a DOTALL pattern capturing the content of ``<td ATTRS>...</td>``."""
    return re.compile('<td ' + attrs + '>(.*?)</td>', re.S)


# Cell extractors for the saved IPA page, keyed by the cell's exact attributes.
finds0 = _td_regex('class="tableheadbkgr"')
finds1 = _td_regex('class="b1" width="715"')
finds2 = _td_regex('class="a1" width="715"')
finds3 = _td_regex('align="left" class="b1" width="715"')
finds4 = _td_regex('align="left" class="a1" width="715"')
def remove(tr):
    """Strip inline markup and IPA boilerplate from an HTML fragment.

    Args:
        tr: HTML text (a string).

    Returns:
        The text with <br>, <sub>, <a> and <span> tags replaced (mostly by a
        single space), the literals "--", "Interaction", "Network" and
        "IPA Chem View:" removed, NFKC-normalised, and with newlines dropped.
        Other tags such as <td> are left intact so that the cell patterns can
        still be applied to the result.
    """
    substitutions = (
        (r'<br(\s+)?/>(\s+)?', " "),
        (r'<sub(\s+)?>(\s+)?', " "),
        (r'</sub(\s+)?>(\s+)?', " "),
        (r'<a.*?>', " "),
        (r'</a(\s+)?>(\s+)?', " "),
        (r'<span(\s+)?>(\s+)?', " "),
        (r'<span class="tableinstructional0">', ""),
        # Leaves "1>" behind as a marker, which is removed further down.
        (r'<span id="intNetworkLink"', "1"),
        (r'</span(\s+)?>(\s+)?', " "),
        (r'--', ""),
        (r'Interaction', ""),
        (r'Network', ""),
        # Remove the escaped entity, not every literal ">": deleting all ">"
        # destroyed the tags that the <td ...> patterns need afterwards and
        # made the "1>" rule below unreachable.
        (r'&gt;', ""),
        (r'1>', ""),
        (r'IPA Chem View:', ""),
    )
    for pattern, replacement in substitutions:
        tr = re.sub(pattern, replacement, tr)
    tr = unicodedata.normalize('NFKC', tr)
    return tr.replace('\n', "")
def _padded_cells(pattern, html, expected):
    """Return each match of *pattern* as a one-item list, padded with empty lists up to *expected* entries."""
    cells = [[text] for text in pattern.findall(html)]
    cells.extend([] for _ in range(expected - len(cells)))
    return cells


def getData(url):
    """Parse a saved IPA "Chem View" HTML page into a flat list of cells.

    Args:
        url: path of the local HTML file (opened as UTF-8 text).

    Returns:
        A list of lists: first the matches of ``finds0`` in the page heading
        cell, then, for every ``table.tablenodeviewcontainer``, the matches of
        ``finds1``, ``finds2``, ``finds3`` and ``finds4`` (each wrapped in a
        one-item list) padded with empty lists to at least 8, 8, 2 and 3
        entries respectively. Extra matches beyond those counts are kept.
    """
    # Close the file as soon as it has been parsed.
    with open(url, 'r', encoding='utf-8') as fp:
        soup = BeautifulSoup(fp, 'html.parser')

    data = []
    heading = remove(str(soup.find('td', class_="tableheadbkgr")))
    data.append(re.findall(finds0, heading))

    # (pattern, minimum number of cells) in output order.
    layout = ((finds1, 8), (finds2, 8), (finds3, 2), (finds4, 3))
    for table in soup.find_all('table', class_="tablenodeviewcontainer"):
        table_html = remove(str(table))
        for pattern, expected in layout:
            data.extend(_padded_cells(pattern, table_html, expected))
    return data
def saveData(datalist, savepath, n):
    """Write one record to sheet row *n* of a new workbook saved at *savepath*.

    Row 0 receives the fixed column titles; item ``i`` of *datalist* is
    written to column ``i`` of row *n*.
    """
    print("save......")
    workbook = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = workbook.add_sheet('IPA', cell_overwrite_ok=True)
    col = ("Symbol", "Synonyms", "IUPAC Name", "SMILES", "Chemical Formula", "PubChem Link", "HMDB Link", "regulated by", "role in cell",
           "Systematic Name", "CAS Registry Number", "InChI", "Molecular Weight", "Canonical Pathways", "regulates", "binds", "disease",
           "Members of Subgroup", "Manufacturer", "Member of Groups", "Brand Name", "Therapeutic Categories")
    for column, title in enumerate(col):
        sheet.write(0, column, title)
    for column, cell in enumerate(datalist):
        sheet.write(n, column, cell)
    workbook.save(savepath)
def askURL(url):
    """Fetch *url* and return its body decoded as UTF-8.

    Returns:
        The decoded body, or "" when the request raises ``URLError`` (the
        error's ``code``/``reason``, when present, are printed).
    """
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.81 Safari/537.36 SE 2.X MetaSr 1.0"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The context manager closes the response even if read/decode fails.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
if __name__ == "__main__":
    # Parse the locally saved aspirin page and store it as row 1 of the sheet.
    source_page = "./IPA Chem View aspirin IPA.htm"
    savepath = ".\\aspirin.xls"
    saveData(getData(source_page), savepath, 1)
    print("爬取完畢")
專案3 爬取chem960
- chem資料夾:化合物
  - 輸入檔案:chem960_cas_list.txt
  - 爬取程式碼:chemreptile.py
  - 輸出檔案: chem960.txt
- plant資料夾:植物
  - 輸入檔案:chem960_plant_list.txt
  - 爬取程式碼:plant.py
  - 輸出檔案: plant.txt
- 對應關係:
  - plant-chem.txt :植物-化合物關係主表
  - plant-chem1.txt :植物-化合物關係補充表
3.1 爬取化合物程式碼
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 22 09:20:31 2020
@author: JX
"""
from bs4 import BeautifulSoup
import re
import urllib.request,urllib.error
# Load the crawl targets: one entry per non-empty line. Each entry is later
# passed to askURL() as the page address. The file is closed once read.
with open('./chem960_cas_list.txt', 'r', encoding='utf-8') as CAS_file:
    CASid = [entry for entry in (line.strip('\n') for line in CAS_file) if entry != '']

# Extractors applied to the cleaned HTML of the "l cas-inner" blocks.
finds0 = re.compile(r'<span class="font-333".*?>(.*?)</span>',re.S)
finds1 = re.compile(r'<b class="blue-light">(.*?)</b>',re.S)
finds2 = re.compile(r'<span class="font-666">(.*?)</span>',re.S)
finds3 = re.compile(r'<a class="blue-light".*?>(.*?)</a>',re.S)  # link text
finds4 = re.compile(r'<a class="blue-light" href="(.*?)".*?>.*?</a>',re.S)  # link target
def remove(tr):
    """Replace <sub> / </sub> tags (and whitespace following them) by one space."""
    # Raw strings: "\s" in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    tr = re.sub(r'<sub(\s+)?>(\s+)?', " ", tr)
    tr = re.sub(r'</sub(\s+)?>(\s+)?', " ", tr)
    return tr
def askURL(url):
    """Fetch *url* and return its body decoded as UTF-8.

    Returns:
        The decoded body, or "" when the request raises ``URLError`` (the
        error's ``code``/``reason``, when present, are printed).
    """
    # NOTE(review): the Cookie below is a hard-coded browser session copied
    # into the source; it will expire and should not live in version control.
    head = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
            'Cookie':'Abp.Localization.CultureName=zh-CN; _tempuserid=-132410786458757023; ASP.NET_SessionId=yriedc1wg4rjus5ujo4cf3kw; __RequestVerificationToken=vmt6p0lZkJfMiYcD2fynEVcsNotQhSmkZRXfBvIbd5iJYqHXTKG5o6vBIpWEGbDa9GG852NyiEvi-0SmUMvytCMy7Dg1; Hm_lvt_e96d4d23e997677f26ac69b89fc71ec7=1596605046,1596688605; XSRF-TOKEN=PbrL6FUCtOR653DCqLq0fZy8Hutu-qnUHce1sxk2NCJpJuypiuVkSyNLa19aKLvZRatCRmy_1AAl69Rt6SowTAbFI7U1; Hm_lpvt_e96d4d23e997677f26ac69b89fc71ec7=1596688760',
            }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The context manager closes the response even if read/decode fails.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
def save_contents(urlist):
    """Append the ``str()`` form of *urlist* as one line to ./chem960.txt."""
    record = str(urlist) + '\n'
    with open("./chem960.txt", 'a+', encoding='utf-8') as out:
        out.write(record)
kong = []  # placeholder stored when the second "font-333" span is missing
for url in CASid:
    data = []
    print('第{}頁'.format(url))
    html = askURL(url)
    soup = BeautifulSoup(html, 'html.parser')
    result = remove(str(soup.find_all('div', class_="l cas-inner")))

    s0 = re.findall(finds0, result)
    if s0:
        data.append(s0[0])
        data.append(s0[1] if len(s0) > 1 else kong)

    s1 = re.findall(finds1, result)
    if s1:
        data.append(s1[0])

    # Keep at most the first two "font-666" spans; indexing s2[1]
    # unconditionally raised IndexError when only one span was present.
    s2 = re.findall(finds2, result)
    data.extend(s2[:2])

    # Pair each link text with its href. zip() stops at the shorter list, so
    # a link without a leading href attribute can no longer cause IndexError.
    s3 = re.findall(finds3, result)
    s4 = re.findall(finds4, result)
    data.extend(label + ':' + href for label, href in zip(s3, s4))

    save_contents(data)
3.2 爬取植物程式碼
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 22 09:20:31 2020
@author: JX
"""
from bs4 import BeautifulSoup
import re
import urllib.request,urllib.error
import time
import unicodedata
import requests
# Extractors for the plant page: heading fallback, heading, compound links,
# and the spans inside the "casno" blocks.
finds = re.compile(r'<span style="font-size:18px;">(.*?)</span>',re.S)
finds0 = re.compile(r'<strong>(.*?)</strong>',re.S)
finds1 = re.compile(r'<a href=".*?" target="_blank">(.*?)</a>',re.S)
finds2 = re.compile(r'<span>(.*?)</span>',re.S)

# Load the crawl targets: one entry per non-empty line, later passed to
# askURL(). The file is closed once read.
with open('./test.txt', 'r', encoding='utf-8') as plant_file:
    plantid = [entry for entry in (line.strip('\n') for line in plant_file) if entry != '']
def remove(tr):
    """Clean an HTML fragment: drop <sub> tags and parentheses, then NFKC-normalise.

    <sub> / </sub> tags (with trailing whitespace) and the characters "(" and
    ")" are each replaced by a single space.
    """
    tr = re.sub(r'<sub(\s+)?>(\s+)?', " ", tr)
    tr = re.sub(r'</sub(\s+)?>(\s+)?', " ", tr)
    # Plain string replacement: a bare "(" or ")" is not a valid regular
    # expression, so re.sub('(', ...) raised re.error on every call.
    # NOTE(review): the source may originally have targeted full-width
    # parentheses; those still survive until the NFKC step below - confirm.
    tr = tr.replace('(', " ")
    tr = tr.replace(')', " ")
    tr = unicodedata.normalize('NFKC', tr)
    return tr
def askURL(url):
    """Fetch *url* and return its body decoded as UTF-8.

    Returns:
        The decoded body, or "" when the request raises ``URLError`` (the
        error's ``code``/``reason``, when present, are printed).
    """
    # NOTE(review): the Cookie below is a hard-coded browser session copied
    # into the source; it will expire and should not live in version control.
    head = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
            'Cookie':'Abp.Localization.CultureName=zh-CN; _tempuserid=-132410786458757023; ASP.NET_SessionId=yriedc1wg4rjus5ujo4cf3kw; __RequestVerificationToken=vmt6p0lZkJfMiYcD2fynEVcsNotQhSmkZRXfBvIbd5iJYqHXTKG5o6vBIpWEGbDa9GG852NyiEvi-0SmUMvytCMy7Dg1; Hm_lvt_e96d4d23e997677f26ac69b89fc71ec7=1596605046,1596688605; XSRF-TOKEN=2a7785UE6WhyURBYISJQHFzhMqcSY6JMFHtCwGQfg15GcjCsMgzEzsaC7fA2TMhGCHVdezeiYMJZ3yX7ASYSxGRbzJ41; Hm_lpvt_e96d4d23e997677f26ac69b89fc71ec7=1596690112',
            }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The context manager closes the response even if read/decode fails.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
#url="https://www.chem960.com/unpd/s475"
datalist = []


def getData():
    """Crawl every page in ``plantid`` and collect (plant, compound) pairs.

    For each page one list is appended to the module-level ``datalist``. That
    list alternates the plant-name matches with one compound record, i.e.
    ``[name, compound_1, name, compound_2, ...]``, and is empty when the page
    yields no compound links or fewer than three "casno" spans.

    Returns:
        The module-level ``datalist``.
    """
    for url in plantid:
        data = []
        data1 = []
        print('第{}頁'.format(url))
        html = askURL(url)
        soup = BeautifulSoup(html, 'html.parser')

        # Plant name: prefer <strong>, fall back to the 18px <span>.
        name = remove(str(soup.find_all('div', class_="nright")))
        s0 = re.findall(finds0, name)
        if s0 != []:
            data.append(s0)
        else:
            data.append(re.findall(finds, name))

        result = str(soup.find_all('td', class_="natural-hhw-list"))
        s1 = re.findall(finds1, result)
        if s1 != []:
            result1 = remove(str(soup.find_all('div', class_="casno")))
            s2 = re.findall(finds2, result1)
            # The code treats every three spans as one compound.
            num = len(s2) // 3
            # Link texts per compound; 0 when there are fewer links than compounds.
            chunk = len(s1) // num if num else 0
            # Guard: with fewer than three spans (num == 0) or too few links
            # (chunk == 0) the divisions / range steps below used to raise.
            if num and chunk:
                span_step = len(s2) // num
                a = span_step
                b_idx = span_step + 1
                # Overwrite two link texts of every compound with the 2nd and
                # 3rd span of its "casno" block.
                for i in range(1, len(s2) + 1, span_step):
                    # Stop instead of raising IndexError when the page does
                    # not have the expected regular layout.
                    if i + 1 >= len(s2) or b_idx >= len(s1):
                        break
                    s1[a] = s2[i]
                    s1[b_idx] = s2[i + 1]
                    a += chunk
                    b_idx += chunk
                # One record (slice of s1) per compound.
                for start in range(0, len(s1), chunk):
                    data.append(s1[start:start + chunk])
                # Pair the plant name with each compound record.
                for i in range(1, num + 1):
                    data1.append(data[0])
                    data1.append(data[i])
        datalist.append(data1)
    return datalist
def save_contents(urlist):
    """Write the crawl result to ./plant.txt (the file is overwritten).

    *urlist* is a list of per-page lists whose items are lists of strings.
    Each item is joined with ";" and followed by a tab; after every second
    item, counted across all pages, a newline is written.
    """
    with open("./plant.txt", 'w', encoding='utf-8') as out:
        written = 0
        for page_items in urlist:
            for parts in page_items:
                out.write(str(";".join(parts).strip()))
                out.write('\t')
                written += 1
                if written % 2 == 0:
                    out.write('\n')
if __name__ == "__main__":
    # Crawl all plant pages, then dump the collected pairs to plant.txt.
    datalist = getData()
    save_contents(datalist)
3.3 爬取植物介紹
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 22 09:20:31 2020
@author: JX
"""
from bs4 import BeautifulSoup
import re
import urllib.request,urllib.error
# Load the crawl targets: one entry per non-empty line, later passed to
# askURL(). The file is closed once read.
with open('./test.txt', 'r', encoding='utf-8') as plant_file:
    plantid = [entry for entry in (line.strip('\n') for line in plant_file) if entry != '']
def remove1(jieshao):
    """Reduce the introduction <blockquote> to bare text without whitespace.

    Args:
        jieshao: any object; it is converted with ``str()`` first, so passing
            ``None`` yields the text "None".

    Returns:
        The string with blockquote/p/strong/span/div/br markup removed and
        ALL whitespace deleted.
    """
    # Applied in this exact order. Raw strings avoid the invalid "\s" escape
    # warning of plain string literals.
    patterns = (
        r'<br/></br></blockquote>',
        r'<blockquote.*?<br>',
        r'<blockquote.*?>',
        # Never matches: the previous rule has already removed every opening
        # <blockquote ...> tag. Kept to preserve the original rule list.
        r'<blockquote.*?<p>',
        r'<strong.*?</strong>',
        r'<p.*?>',
        r'</p>',
        r'</blockquote>',
        r'<br/>',
        r'<span.*?>(\s+)?',
        r'</span(\s+)?>(\s+)?',
        r'<div.*?>',
        r'</div>',
        r'<br>',
        r'</br>',
        r'<a></a>',
        r'\s+',
    )
    text = str(jieshao)
    for pattern in patterns:
        text = re.sub(pattern, "", text)
    return text.strip()
def askURL(url):
    """Fetch *url* and return its body decoded as UTF-8.

    Returns:
        The decoded body, or "" when the request raises ``URLError`` (the
        error's ``code``/``reason``, when present, are printed).
    """
    # NOTE(review): the Cookie below is a hard-coded browser session copied
    # into the source; it will expire and should not live in version control.
    head = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
            'Cookie':'Abp.Localization.CultureName=zh-CN; _tempuserid=-132410786458757023; ASP.NET_SessionId=yriedc1wg4rjus5ujo4cf3kw; __RequestVerificationToken=vmt6p0lZkJfMiYcD2fynEVcsNotQhSmkZRXfBvIbd5iJYqHXTKG5o6vBIpWEGbDa9GG852NyiEvi-0SmUMvytCMy7Dg1; Hm_lvt_e96d4d23e997677f26ac69b89fc71ec7=1596605046,1596688605; XSRF-TOKEN=2a7785UE6WhyURBYISJQHFzhMqcSY6JMFHtCwGQfg15GcjCsMgzEzsaC7fA2TMhGCHVdezeiYMJZ3yX7ASYSxGRbzJ41; Hm_lpvt_e96d4d23e997677f26ac69b89fc71ec7=1596690112',
            }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The context manager closes the response even if read/decode fails.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
#url="https://www.chem960.com/unpd/s475"
datalist = []


def save_contents(urlist):
    """Append the joined text of *urlist* to ./jieshao.txt, ``num`` times.

    ``num`` is the module-level compound count set by the crawl loop below
    for the page being processed; the text is repeated once per compound.
    """
    with open("./jieshao.txt", 'a+', encoding='utf-8') as f:
        lines = "".join(urlist).strip()
        for _ in range(num):
            f.write(lines + '\n')


# Hoisted out of the loop: the pattern does not change between pages.
finds2 = re.compile(r'<span>(.*?)</span>', re.S)

for url in plantid:
    data = []
    print('第{}頁'.format(url))
    html = askURL(url)
    soup = BeautifulSoup(html, 'html.parser')
    s2 = re.findall(finds2, str(soup.find_all('div', class_="casno")))
    # Reset for every page. Previously a page without spans either reused the
    # count of the preceding page or hit NameError on the first page.
    num = 0
    if s2 != []:
        # Every three spans are treated as one compound.
        num = int(len(s2) / 3)
        print(num)
    jieshao = soup.find('blockquote', class_="layui-elem-quote maigin-top-5")
    # Test for a missing introduction BEFORE cleaning: remove1() stringifies
    # its argument, so the old post-cleaning "== None" test never fired and
    # the literal text "None" was written. Use "" (not a list) so that the
    # join in save_contents() keeps working.
    if jieshao is None:
        data.append("")
    else:
        data.append(remove1(jieshao).strip())
    save_contents(data)
專案4 爬取植物資料庫
- cookie資料夾:測試獲取cookie的方法
  - cookie.py requests方法
  - ip.py ip代理池方法
  - selenium.py selenium方法
  - session.py session方法
- plantname資料夾:獲取植物名字
  - 輸入檔案:MF.txt
  - 爬取程式碼:formdate.py
  - 輸出檔案: plantname.txt
  - 去重程式碼:delete.py
- 植物-化合物對應關係:
  - plant-chem.py :植物-化合物程式碼
4.1 爬取植物名字
import requests
from bs4 import BeautifulSoup
import re
import urllib.request
import time
import json
import http.cookiejar as cj
from itertools import islice
def getMF():
    """Return the non-empty lines of ./MF1.txt (UTF-8), without their newline."""
    # The context manager closes the file; it was previously left open.
    with open('./MF1.txt', 'r', encoding='utf-8') as MF_file:
        stripped = (line.strip('\n') for line in MF_file)
        return [entry for entry in stripped if entry != '']
def removes(dd):
    """Turn "<prefix>; ('<href>', '<name>')" text into tab-separated fields.

    The input is the concatenation of a prefix string with the ``str()`` of a
    two-item regex match tuple; tuple punctuation becomes tabs, double quotes
    are dropped and the ``&amp;`` entity is decoded.
    """
    dd = re.sub(r"; \('", "\t", dd)
    # Decode the HTML entity. The published code replaced "&" by "&" (a
    # no-op), which looks like an entity that was decoded when the article
    # was rendered - TODO confirm against the original script.
    dd = re.sub(r'&amp;', "&", dd)
    dd = re.sub(r'"', "", dd)
    dd = re.sub(r"'\)", "\t", dd)
    # Field separator of the tuple repr (previously routed through a
    # temporary "sss666" marker).
    dd = re.sub(r"', '", "\t", dd)
    dd = re.sub(r"',", "", dd)
    return dd
# Counters used by the main loop: `n` is the 1-based index of the next formula
# to process, `cw` counts pages that looked like an error page.
n = 1
cw = 1


def search(MF):
    """POST a molecular-formula query and return the result page text.

    Args:
        MF: formula string sent as the ``MF`` form field.

    Returns:
        The response body decoded as GBK, or "" when the request fails (the
        exception is printed).
    """
    url = 'http://chemdb.sgst.cn/ssdb/plant/plant_R2.asp'
    # NOTE(review): 'Page' is fixed to '198' - confirm this is intended.
    jiansuo = {'MF': MF, 'flag': '0', 'Page': '198'}
    # No explicit Content-Length: requests derives it from the encoded form
    # body, so a hard-coded '86' is at best redundant.
    head = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
            'Cookie':'ASPSESSIONIDSQSCACAD=NIPBNOPACHLJKHFPHGLMDFPK',
            'Origin': 'http://chemdb.sgst.cn',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8,en-US;q=0.7,en;q=0.6',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded',
            }
    try:
        r = requests.post(url, data=jiansuo, headers=head)
        r.raise_for_status()
    # requests raises its own exception hierarchy; the previous
    # `except urllib.error.URLError` could never catch a failed request.
    except requests.RequestException as e:
        print(e)
        return ""
    r.encoding = "GBK"
    print('連結成功')
    return r.text
# Compiled once instead of on every iteration.
finds0 = re.compile(r'<font color="red">(.*?)</font>',re.S)
finds1 = re.compile(r'<a href="plant_descript.asp(.*?)>(.*?)</a>',re.S)
fcuo = re.compile(r'404', re.S)

for MF in getMF():
    time.sleep(3)
    # Guard against a missing response so BeautifulSoup does not raise.
    s = search(MF) or ""
    soup = BeautifulSoup(s, 'html.parser')
    # Count the page as failed when it mentions 404 or is empty.
    if not s or fcuo.search(s):
        cw += 1
        if cw == 10:
            # Too many failures: save the formulas after the current one to
            # MF_out.txt so that the crawl can be resumed, then stop.
            path = './MF1.txt'
            path_out = './MF_out.txt'
            with open(path, encoding='utf-8') as f, open(path_out, 'w+', encoding='utf-8') as f_out:
                f_out.writelines(islice(f, n, None))
            cw = 1
            break
        print(cw)
    n += 1
    table = str(soup.find('div'))
    s0 = finds0.findall(table)
    s1 = finds1.findall(table)
    # Open the output once per formula rather than once per written row.
    with open("./form.txt", 'a+', encoding='utf-8') as f:
        if s1:
            # The second red <font> text is used as row prefix. Fall back to
            # the queried formula (with the "; " that removes() expects) when
            # it is absent, instead of raising IndexError.
            # NOTE(review): confirm what the second red text contains.
            prefix = s0[1] if len(s0) > 1 else str(MF) + '; '
            for hit in s1:
                f.write(removes(prefix + str(hit)) + '\n')
        else:
            f.write(str(MF) + '\t' + 'None' + '\n')
4.2 爬取植物—化合物
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 20 10:27:44 2020
@author: JX
"""
import time
import requests
from bs4 import BeautifulSoup
import re
import random
def plant():
    """Fetch one species page and append its parsed fields to ./plant.txt.

    Written fields (each as "label: value" on its own line): species name,
    scientific name, description, taxonomy links, chemical-composition link,
    possibly effective diseases, image URLs, literature links and naming
    reference. Returns None; prints a message and returns early when the
    request fails.
    """
    url = 'http://chemdb.sgst.cn/ssdb/plant/Species_des.asp'
    formdata = {'m_PlantID':'1', 'tid':'242263'}
    # NOTE(review): Host/Referer name www.organchem.csdb.cn while the request
    # goes to chemdb.sgst.cn - confirm this is deliberate.
    # No explicit Content-Length: requests derives it from the form body.
    head = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3775.400 QQBrowser/10.6.4208.400',
            'Cookie':'ASPSESSIONIDQQTADDAD=LLNHIEEDONIDKJOIHOFPNEEH; __jsluid_h=dada7ae00f096479097a121016cd042c',
            'Host': 'www.organchem.csdb.cn',
            'Upgrade-Insecure-Requests':'1',
            'Referer':'http://www.organchem.csdb.cn/scdb/plant/plant_R.asp',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded',
            }
    try:
        r = requests.post(url, data=formdata, headers=head)
        time.sleep(random.random() * 3)
        r.raise_for_status()
        r.encoding = "GBK"
        print('連結成功')
    except requests.RequestException:
        print('連結失敗')
        # Nothing to parse; continuing would hit an unbound `r`.
        return

    soup = BeautifulSoup(r.text, 'html.parser')
    finds0 = re.compile(r'<a href="(.*?)">.*?</a>', re.S)
    with open("./plant.txt", 'a+', encoding='utf-8') as f:
        name = soup.find('td', class_='content_project3')
        if name is not None:
            name = re.sub(r'\s+', '', str(name.get_text())).strip()
            f.write("物種: " + name + '\n')

        xuename = soup.find('td', class_='content_project1')
        if xuename is not None:
            xuename = re.sub(r'\s+', '', str(xuename.get_text())).strip()
            f.write("學名: " + xuename + '\n')

        # Description: prefer <p> children, otherwise fall back to the cell
        # that follows the description title.
        miaoshu = soup.select('table[class="table2"] td[class="content_project1"] p')
        if len(miaoshu):
            for c in miaoshu:
                f.write("植物描述: " + c.text.strip() + '\n')
        else:
            title = soup.select('td[class="title_project1"]')
            if '植物描述' in str(title):
                zi = soup.select('table[class="table2"] td[class="content_project1"]')
                finds1 = re.compile(r'<td class="content_project1">.*?<td class="content_project1">(.*?)</td>.*?', re.S)
                for c in re.findall(finds1, str(zi)):
                    f.write("植物描述: " + c.strip() + '\n')
            else:
                print("植物描述空")

        for c in soup.select('a[href*="_des.asp"]'):
            f.write("植物分類資訊: " + c.text.strip() + '\n')

        species = soup.select('a[href*="Species_compounds"]')
        species = re.findall(finds0, str(species))
        # Decode the HTML entity in the href. The published code replaced "&"
        # by "&" (a no-op), apparently an entity decoded when the article was
        # rendered - TODO confirm.
        species = re.sub(r'&amp;', "&", str(species))
        f.write("化學成分: " + species + '\n')

        finds3 = re.compile(r'<td class="content_project1" title="植物的.*?">(.*?)</td>.*?', re.S)
        for d in re.findall(finds3, str(soup)):
            f.write("可能有效疾病: " + d.strip() + '\n')

        image = soup.select('div[class="jqzoom"] img')
        finds4 = re.compile(r'<img.*?jqimg="(.*?)".*?/>', re.S)
        for i in re.findall(finds4, str(image)):
            f.write("植物圖片: " + i.strip() + '\n')

        for c in soup.select('a[href^="plant_lit"]'):
            f.write("研究文獻: " + c.text.strip() + '\n')

        for c in soup.select("td:nth-of-type(25)"):
            f.write("命名參考: " + c.text.strip() + '\n')
plant()
'''
def getpage():
url = 'http://www.organchem.csdb.cn/scdb/plant/Species_compounds.asp'
formdata = {'m_PlantID':'1', 'tid':'242236'}
head = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
'Cookie':'ASPSESSIONIDSSRBABDC=MEAGDGECDGEALGILLHLAwwwDOHB; Hm_lvt_2d9981563033253e57169917f0e2a4d5=1597824270; Hm_lpvt_2d9981563033253e57169917f0e2a4d5=1597824270; ASPSESSIONIDSQTCCDAD=NJJPDIHCEMOLBEKNOOFABFAM',
'Host': 'www.organchem.csdb.cn',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8,en-US;q=0.7,en;q=0.6',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Content-Length': '86',
'Content-Type': 'application/x-www-form-urlencoded',
}
try:
r = requests.post(url, data=formdata,headers= head)
r.raise_for_status()
r.encoding = "GBK"
except:
print('連結失敗')
soup = BeautifulSoup(r.text,'html.parser')
findpage = re.compile(r'當前頁碼:1/(.*?),10條紀錄/頁', re.S)
page = re.findall(findpage, str(soup))
page = int(page[0])
'''
def srn():
    """Crawl the compound list of one plant and each compound's property page.

    Pages 1..65 of the list are requested. For every compound link found, the
    property page is fetched and one line is appended to ./chemtest.txt:
    "<plant name>: <link text>\\t<text of the first four table rows>".
    A failed request is reported and skipped.
    """
    list_url = 'http://chemdb.sgst.cn/ssdb/plant/Species_compounds.asp?'
    detail_url = 'http://chemdb.sgst.cn/ssdb/str/str_cas_property.asp'
    # One header set for both requests (the former copies were identical).
    # NOTE(review): Host/Referer name www.organchem.csdb.cn while the requests
    # go to chemdb.sgst.cn - confirm this is deliberate.
    # No explicit Content-Length: requests derives it from the form body.
    head = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3775.400 QQBrowser/10.6.4208.400',
            'Cookie':'ASPSESSIONIDQQTADDAD=LLNHIEEDONIDKJOIHOFPNEEH; __jsluid_h=dada7ae00f096479097a121016cd042c',
            'Host': 'www.organchem.csdb.cn',
            'Upgrade-Insecure-Requests':'1',
            'Referer':'http://www.organchem.csdb.cn/scdb/plant/plant_R.asp',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded',
            }
    for pn in range(1, 66):
        print(pn)
        formdata = {'m_PlantID':'1', 'tid':'242263','pageno':pn}
        try:
            r = requests.post(list_url, data=formdata, headers=head)
            time.sleep(random.random() * 3)
            r.raise_for_status()
            r.encoding = "GBK"
            print('連結成功')
        except requests.RequestException:
            print('連結失敗')
            # Skip this page; there is no valid response to parse.
            continue
        soup = BeautifulSoup(r.text, 'html.parser')
        links = soup.select('a[href*="str_cas_property.asp"]')
        name = soup.find('td', class_='content_project1')
        # Use an empty name when the cell is missing, so that the string
        # concatenations below cannot raise TypeError on None.
        if name is not None:
            name = re.sub(r'\s+', '', str(name.get_text())).strip()
        else:
            name = ""
        for s in links:
            num = s.text.strip()
            print(name + ": " + num)
            try:
                detail = requests.post(detail_url, headers=head, data={'srn': num})
                time.sleep(random.random() * 3)
                detail.raise_for_status()
                detail.encoding = "GBK"
            except requests.RequestException:
                print('連結失敗')
                # Previously the stale list-page response was parsed here.
                continue
            detail_soup = BeautifulSoup(detail.text, 'html.parser')
            # Text of the first four table rows.
            rowline = [row.get_text(strip=True) for row in detail_soup.find_all("tr")[:4]]
            with open("./chemtest.txt", 'a+', encoding='utf-8') as f:
                f.write(name + ": " + num + '\t' + str(rowline) + '\n')
srn()
相關文章
- 爬蟲專案爬蟲
- python爬蟲實操專案_Python爬蟲開發與專案實戰 1.6 小結Python爬蟲
- 【爬蟲】爬蟲專案推薦 / 思路爬蟲
- 爬蟲小專案爬蟲
- 爬蟲專案部署爬蟲
- 爬蟲細節總結爬蟲
- 爬蟲個人總結爬蟲
- 奇伢爬蟲專案爬蟲
- scrapyd 部署爬蟲專案爬蟲
- 網路爬蟲專案爬蟲
- 網路爬蟲流程總結爬蟲
- Java 爬蟲專案實戰之爬蟲簡介Java爬蟲
- python爬蟲初探--第一個python爬蟲專案Python爬蟲
- Python網路爬蟲實戰專案大全 32個Python爬蟲專案demoPython爬蟲
- 爬蟲實戰專案集合爬蟲
- 網路爬蟲(python專案)爬蟲Python
- 100爬蟲專案遷移爬蟲
- gerapy框架爬蟲專案部署框架爬蟲
- 爬蟲專案實戰(一)爬蟲
- 專案--python網路爬蟲Python爬蟲
- 爬蟲的例項專案爬蟲
- 爬蟲實戰專案合集爬蟲
- Python爬蟲教程-31-建立 Scrapy 爬蟲框架專案Python爬蟲框架
- python爬蟲例項專案大全-GitHub 上有哪些優秀的 Python 爬蟲專案?Python爬蟲Github
- 猿人學爬蟲攻防賽總結爬蟲
- 分散式爬蟲總結和使用分散式爬蟲
- python爬蟲-33個Python爬蟲專案實戰(推薦)Python爬蟲
- 精通Scrapy網路爬蟲【一】第一個爬蟲專案爬蟲
- 企業資料爬蟲專案爬蟲
- Java爬蟲專案環境搭建Java爬蟲
- 中科院爬蟲完整專案爬蟲
- 32個Python爬蟲專案demoPython爬蟲
- 爬蟲專案:大麥網分析爬蟲
- Python爬蟲開源專案合集Python爬蟲
- 使用 Golang 寫爬蟲經驗總結Golang爬蟲
- github上的python爬蟲專案_GitHub - ahaharry/PythonCrawler: 用python編寫的爬蟲專案集合GithubPython爬蟲
- (python)爬蟲----八個專案帶你進入爬蟲的世界Python爬蟲
- 一入爬蟲深似海,總結python爬蟲學習筆記!爬蟲Python筆記