Web Scraper Project Summary

Published by POWERFULU on 2020-08-31

Web Scraper Project Manual

Project 1: Scraping ChemicalBook

  1. Scrape the compound list

    Scraper script: chemical.py
    Output file: data.xls

  2. Scrape detailed compound information (see the sketch below)

    Scraper script: pagedata.py
    Output file: pagedata.txt

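The two steps presumably chain together: chemical.py collects CAS numbers into data.xls, and pagedata.py then fetches one detail page per compound. Below is a minimal sketch of that glue step, assuming the detail URLs follow the CAS_<number>.htm pattern used in pagedata.py; the CAS values and the loop itself are illustrative, not the project's actual code.

    import requests

    # Hypothetical glue step: turn scraped CAS numbers into detail-page requests.
    # Assumes ChemicalBook detail pages follow the CAS_<number>.htm pattern seen below.
    cas_numbers = ["5446-18-4", "50-78-2"]            # e.g. read back from data.xls
    headers = {'User-Agent': 'Mozilla/5.0'}

    for cas in cas_numbers:
        url = "https://www.chemicalbook.com/CAS_{}.htm".format(cas)
        resp = requests.get(url, headers=headers)
        print(cas, resp.status_code)                  # pagedata.py-style parsing would go here
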
1.1 Code for scraping CAS number, Chinese name, English name, and molecular formula

    # -*- coding: utf-8 -*-
    """
    Created on Tue Jul 21 09:49:56 2020
    
    @author: JX
    """
    
    import requests
    from bs4 import BeautifulSoup
    import re
    import xlwt
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
    
    # list-page URLs at offsets 0 and 100 (CASDetailList_0.htm and CASDetailList_100.htm)
    base_url = [
        'https://www.chemicalbook.com/CASDetailList_{}.htm'.format(i) for i in range(0, 101, 100)
    ]
    print(base_url)
    # regexes for the four table cells: CAS number, Chinese name, English name, molecular formula
    finds1 = re.compile(r'<a class="blue" href="/CAS.*?">(.*?)</a>', re.S)
    finds2 = re.compile(r'<a class="blue" href="/ChemicalProductProperty_CN_.*">(.*?)</a>', re.S)
    finds3 = re.compile(r'<td width="380">(.*?)</td>', re.S)
    finds4 = re.compile(r'<span id="ContentPlaceHolder1_ProductClassDetail_.*">(.*?)</span>', re.S)
    
    
    
    def getData():
        datalist = []
        for url in base_url:  # loop over the list pages

            print('Page {}'.format(url))
            page = requests.get(url, headers=headers)  # send the browser User-Agent defined above
            # print(page.status_code)
            soup = BeautifulSoup(page.content, 'html.parser')
            # print(soup.prettify())
            for tr in soup.find_all('tr'):
                data = []
                tr = str(tr)
                tr = re.sub('\r\n', " ", tr)  # replace line breaks with spaces
                s1 = re.findall(finds1, tr)
                if s1 != []:
                    data.append(s1[0])
                s2 = re.findall(finds2, tr)
                if s2 != []:
                    data.append(s2[0])
                s3 = re.findall(finds3, tr)
                if s3 != []:
                    data.append(s3[0])
                s4 = re.findall(finds4, tr)
                if s4 != []:
                    data.append(s4[0])
    
                # print(data)
                datalist.append(data)
    
        return (datalist)
    
    
    def saveData(datalist, savepath):
        print("save......")
        book = xlwt.Workbook(encoding="utf-8", style_compression=0)
        sheet = book.add_sheet('IPA', cell_overwrite_ok=True)
        col = ("CAS", "中文名", "英文名", "MF")
        for i in range(0, 4):
            sheet.write(0, i, col[i])
        for i in range(len(datalist)):       # iterate over however many rows were scraped
            data = datalist[i]
            if len(data) == 4:               # skip incomplete rows to avoid an IndexError
                for j in range(4):
                    sheet.write(i + 1, j, data[j])
    
        book.save(savepath)
        
    if __name__ == "__main__":
        datalist = getData()
        print(datalist)
        savepath = ".\\data1.xls"
        del(datalist[0])
        #saveData(datalist,savepath)
        print("爬取完畢")

1.2 Code for scraping a single compound page

    # -*- coding: utf-8 -*-
    """
    Created on Wed Jul 22 09:20:31 2020
    
    @author: JX
    """
    
    import requests
    import re
    from bs4 import BeautifulSoup
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
    url = "https://www.chemicalbook.com/CAS_5446-18-4.htm"
    
    # append scraped text to the output file (called pagedata.txt in the manual above)
    def save_contents(urlist):
        with open("./data.txt", 'a+', encoding='utf-8') as f:
            for i in urlist:
                f.write(i)
            #f.write(' ')
    
    page = requests.get(url)
    newp=page.text.replace('<br />','')
    
    print(page.status_code)
    soup = BeautifulSoup(page.content, 'html.parser')
    trs = soup.find_all('div',id="ContentPlaceHolder1_SubClass")
    for tr in trs:
        for td in tr.stripped_strings:
            #print(td)
            save_contents(td)  
    
    # re-read the dump and split on the 【label】value blocks used on ChemicalBook pages
    with open('data.txt', 'r', encoding='utf-8') as f:
        dic = []
        for line in f.readlines():
            #line = str(line).replace("\n","")
            b = re.split('【', line)
            dic.append(b)
            
    dic=str(dic)
    #save_contents(str(dic))  
    dic = re.sub('】',":",dic)
    print(dic)

Project 2: Scraping the IPA database (single-page retrieval code)

    # -*- coding: utf-8 -*-
    """
    Created on Thu Jul 23 16:31:25 2020
    
    @author: JX
    """
    
    from bs4 import BeautifulSoup
    import re
    import urllib.request,urllib.error
    import xlwt
    import unicodedata
    
    finds0 = re.compile(r'<td class="tableheadbkgr">(.*?)</td>',re.S)
    finds1 = re.compile(r'<td class="b1" width="715">(.*?)</td>',re.S)
    finds2 = re.compile(r'<td class="a1" width="715">(.*?)</td>',re.S)
    finds3 = re.compile(r'<td align="left" class="b1" width="715">(.*?)</td>',re.S)
    finds4 = re.compile(r'<td align="left" class="a1" width="715">(.*?)</td>',re.S)
    
    # strip tags and boilerplate tokens from extracted HTML fragments
    def remove(tr):
        tr = re.sub('<br(\s+)?/>(\s+)?'," ",tr)
        tr = re.sub('<sub(\s+)?>(\s+)?'," ",tr)
        tr = re.sub('</sub(\s+)?>(\s+)?'," ",tr)
        tr = re.sub('<a.*?>'," ",tr)
        tr = re.sub('</a(\s+)?>(\s+)?'," ",tr)
        tr = re.sub('<span(\s+)?>(\s+)?'," ",tr)
        tr = re.sub('<span class="tableinstructional0">', "", tr)
        tr = re.sub('<span id="intNetworkLink"', "1", tr)
        tr = re.sub('</span(\s+)?>(\s+)?'," ",tr)
        tr = re.sub('--',"",tr)
        tr = re.sub('Interaction', "", tr)
        tr = re.sub('Network', "", tr)
        tr = re.sub('&gt;', "", tr)
        tr = re.sub('1>', "", tr)
        tr = re.sub('IPA Chem View:', "", tr)
        tr = unicodedata.normalize('NFKC', tr)
        tr = tr.replace('\n', "")
        return tr
    
    # parse a locally saved IPA "Chem View" page into a flat list of cell values
    def getData(url):
        data = []
        kong = []
        fp = open(url,'r',encoding='utf-8')
        soup = BeautifulSoup(fp,'html.parser')
        res0 = str(soup.find('td', class_="tableheadbkgr"))
        res0 = remove(res0)
        s0 = re.findall(finds0, res0)
        #print(s0)
        data.append(s0)
        res = soup.find_all('table',class_="tablenodeviewcontainer")
        for tr in res:    
            tr = str(tr)
            tr = remove(tr)
            # pad each cell group to a fixed column count so every row has the same layout
            s1 = re.findall(finds1, tr)
            data.extend([[i] for i in s1])
            data.extend([kong] * (8 - len(s1)))    # 8 "b1" cells expected
            s2 = re.findall(finds2, tr)
            data.extend([[i] for i in s2])
            data.extend([kong] * (8 - len(s2)))    # 8 "a1" cells expected
            s3 = re.findall(finds3, tr)
            data.extend([[i] for i in s3])
            data.extend([kong] * (2 - len(s3)))    # 2 left-aligned "b1" cells expected
            s4 = re.findall(finds4, tr)
            data.extend([[i] for i in s4])
            data.extend([kong] * (3 - len(s4)))    # 3 left-aligned "a1" cells expected
        return data
    
      
    def saveData(datalist,savepath,n):
        print("save......")
        book = xlwt.Workbook(encoding="utf-8",style_compression=0)
        sheet = book.add_sheet('IPA',cell_overwrite_ok=True)
        col = ("Symbol","Synonyms","IUPAC Name","SMILES","Chemical Formula","PubChem Link","HMDB Link","regulated by","role in cell",
        "Systematic Name","CAS Registry Number","InChI","Molecular Weight","Canonical Pathways","regulates","binds","disease",
               "Members of Subgroup","Manufacturer","Member of Groups","Brand Name","Therapeutic Categories")
        for i in range(0,len(col)):
            sheet.write(0,i,col[i])
        for i in range(0, len(datalist)):
            sheet.write(n, i, "; ".join(datalist[i]))   # xlwt cells need strings, not lists
        book.save(savepath)
        
    def askURL(url):
        head = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.81 Safari/537.36 SE 2.X MetaSr 1.0"
        }
        request = urllib.request.Request(url,headers = head)
        html = ""
        try:
            response = urllib.request.urlopen(request)
            html = response.read().decode("utf-8")
        except urllib.error.URLError as e:
            if hasattr(e,"code"):
                print(e.code)
            if hasattr(e,"reason"):
                print(e.reason)
        return html
    
    if __name__ == "__main__":
        savepath = ".\\aspirin.xls"
        datalist = getData("./IPA Chem View aspirin  IPA.htm")
        saveData(datalist,savepath,1)
        print("爬取完畢")

Project 3: Scraping chem960

  1. chem folder: compounds

    Input file: chem960_cas_list.txt

    Scraper script: chemreptile.py

    Output file: chem960.txt

  2. plant folder: plants

    Input file: chem960_plant_list.txt

    Scraper script: plant.py

    Output file: plant.txt

  3. Correspondence tables (see the sketch after this list):

    plant-chem.txt: main plant-compound relation table

    plant-chem1.txt: supplementary plant-compound relation table

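The relation tables are assembled after scraping rather than by the scripts shown below. A minimal sketch of how plant-chem.txt could be flattened out of plant.txt, assuming each line written by save_contents() in plant.py holds a plant-name field and a compound record separated by tabs, with ';'-joined sub-fields; the field layout and file names here are assumptions, so the split logic would need to match the real output.

    # Hypothetical post-processing: flatten plant.txt into plant-chem.txt pairs.
    # Assumes lines look like "<plant fields>\t<compound fields>\t" with ';' inside each field.
    with open("plant.txt", encoding="utf-8") as src, \
            open("plant-chem.txt", "w", encoding="utf-8") as dst:
        for line in src:
            parts = [p for p in line.rstrip("\n").split("\t") if p]
            if len(parts) >= 2:
                plant = parts[0].split(";")[0]        # first sub-field: plant name
                compound = parts[1].split(";")[0]     # first sub-field: compound name
                dst.write(plant + "\t" + compound + "\n")
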
3.1 Compound scraping code

    # -*- coding: utf-8 -*-
    """
    Created on Wed Jul 22 09:20:31 2020
    
    @author: JX
    """
    
    from bs4 import BeautifulSoup
    import re
    import urllib.request,urllib.error
    
    CASid = []
    CAS_file = open('./chem960_cas_list.txt', 'r', encoding='utf-8')   # one detail-page URL per line
    for line in CAS_file.readlines():
        line = line.strip('\n')
        if line !='':
            CASid.append(line)
    
    
    finds0 = re.compile(r'<span class="font-333".*?>(.*?)</span>',re.S)
    finds1 = re.compile(r'<b class="blue-light">(.*?)</b>',re.S)
    finds2 = re.compile(r'<span class="font-666">(.*?)</span>',re.S)
    finds3 = re.compile(r'<a class="blue-light".*?>(.*?)</a>',re.S)
    finds4 = re.compile(r'<a class="blue-light" href="(.*?)".*?>.*?</a>',re.S)
    
    def remove(tr):
        tr = re.sub('<sub(\s+)?>(\s+)?'," ",tr)
        tr = re.sub('</sub(\s+)?>(\s+)?'," ",tr)
        return tr
    
    def askURL(url):
        head = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
                'Cookie':'Abp.Localization.CultureName=zh-CN; _tempuserid=-132410786458757023; ASP.NET_SessionId=yriedc1wg4rjus5ujo4cf3kw; __RequestVerificationToken=vmt6p0lZkJfMiYcD2fynEVcsNotQhSmkZRXfBvIbd5iJYqHXTKG5o6vBIpWEGbDa9GG852NyiEvi-0SmUMvytCMy7Dg1; Hm_lvt_e96d4d23e997677f26ac69b89fc71ec7=1596605046,1596688605; XSRF-TOKEN=PbrL6FUCtOR653DCqLq0fZy8Hutu-qnUHce1sxk2NCJpJuypiuVkSyNLa19aKLvZRatCRmy_1AAl69Rt6SowTAbFI7U1; Hm_lpvt_e96d4d23e997677f26ac69b89fc71ec7=1596688760',
        }
        request = urllib.request.Request(url,headers = head)
        html = ""
        try:
            response = urllib.request.urlopen(request)
            html = response.read().decode("utf-8")
        except urllib.error.URLError as e:
            if hasattr(e,"code"):
                print(e.code)
            if hasattr(e,"reason"):
                print(e.reason)
        return html
    
    def save_contents(urlist):
        with open("./chem960.txt",'a+',encoding = 'utf-8') as f:
                #lines="\t".join([";".join(j) for j in i ]).strip()
                f.writelines(str(urlist)+'\n')
    
    kong = []
    for url in CASid:  # loop over detail pages
        data = []
        print('Page {}'.format(url))
        html = askURL(url)
        soup = BeautifulSoup(html, 'html.parser')
        result = soup.find_all('div',class_="l cas-inner")
        result = str(result)
        result = remove(result)
        s0 = re.findall(finds0,result)
        if s0 !=[]:
            data.append(s0[0])
            if len(s0)==1:
                data.append(kong)
            else:
                data.append(s0[1])
        s1 = re.findall(finds1,result)
        if s1 !=[]:
            data.append(s1[0])
        s2 = re.findall(finds2,result)
        if s2 !=[]:
            data.append(s2[0])
            data.append(s2[1])
        s3 = re.findall(finds3,result)
        s4 = re.findall(finds4,result)
        if s3 != [] and s4 != []:   # need both the link text and the href
            for i in range(len(s3)):
                data.append(s3[i]+':'+s4[i])
        save_contents(data)

3.2 Plant scraping code

    # -*- coding: utf-8 -*-
    """
    Created on Wed Jul 22 09:20:31 2020
    
    @author: JX
    """
    
    from bs4 import BeautifulSoup
    import re
    import urllib.request,urllib.error
    import time
    import unicodedata
    import requests
    finds = re.compile(r'<span style="font-size:18px;">(.*?)</span>',re.S)
    finds0 = re.compile(r'<strong>(.*?)</strong>',re.S)
    finds1 = re.compile(r'<a href=".*?" target="_blank">(.*?)</a>',re.S)
    finds2 = re.compile(r'<span>(.*?)</span>',re.S)
    
    plantid = []
    # plant-page URL list (chem960_plant_list.txt in the manual; test.txt is used here)
    plant_file = open('./test.txt', 'r', encoding='utf-8')
    for line in plant_file.readlines():
        line = line.strip('\n')
        if line != '':
            plantid.append(line)
    def remove(tr):
        tr = re.sub('<sub(\s+)?>(\s+)?', " ", tr)
        tr = re.sub('</sub(\s+)?>(\s+)?', " ", tr)
        tr = re.sub(r'\(', " ", tr)   # parentheses must be escaped in a regex
        tr = re.sub(r'\)', " ", tr)
        #tr = re.sub('<span style="font-size:18px;">(\s+)?'," ",tr)
        #tr = re.sub('</span(\s+)?>(\s+)?'," ",tr)
        tr = unicodedata.normalize('NFKC', tr)
        return tr
    
    def askURL(url):
        head = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
                'Cookie':'Abp.Localization.CultureName=zh-CN; _tempuserid=-132410786458757023; ASP.NET_SessionId=yriedc1wg4rjus5ujo4cf3kw; __RequestVerificationToken=vmt6p0lZkJfMiYcD2fynEVcsNotQhSmkZRXfBvIbd5iJYqHXTKG5o6vBIpWEGbDa9GG852NyiEvi-0SmUMvytCMy7Dg1; Hm_lvt_e96d4d23e997677f26ac69b89fc71ec7=1596605046,1596688605; XSRF-TOKEN=2a7785UE6WhyURBYISJQHFzhMqcSY6JMFHtCwGQfg15GcjCsMgzEzsaC7fA2TMhGCHVdezeiYMJZ3yX7ASYSxGRbzJ41; Hm_lpvt_e96d4d23e997677f26ac69b89fc71ec7=1596690112',
        }
        request = urllib.request.Request(url,headers = head)
        html = ""
        try:
            response = urllib.request.urlopen(request)
            html = response.read().decode("utf-8")
        except urllib.error.URLError as e:
            if hasattr(e,"code"):
                print(e.code)
            if hasattr(e,"reason"):
                print(e.reason)
        return html
    #url="https://www.chem960.com/unpd/s475"
    datalist = []
    def getData():
        for url in plantid:  # loop over plant pages
            data = []
            data1 = []
            print('Page {}'.format(url))
            html = askURL(url)
            soup = BeautifulSoup(html, 'html.parser')
            name = soup.find_all('div',class_="nright")
            name = str(name)
            name = remove(name)
            #name = re.sub('</span(\s+)?>(\s+)?', "", name)
            s0 = re.findall(finds0,name)
            if s0 != []:
                data.append(s0)
            else:
                s = re.findall(finds, name)
                data.append(s)
            result = soup.find_all('td', class_="natural-hhw-list")
            result = str(result)
            result1 = ""                      # avoids a NameError when the page lists no compounds
            s1 = re.findall(finds1, result)
            if s1 != []:
                result1 = soup.find_all('div', class_="casno")
                result1 = str(result1)
                result1 = remove(result1)
            s2 = re.findall(finds2, result1)
            if len(s2) >= 3:                  # need at least one complete record; num below would be 0 otherwise
                num = int(len(s2)/3)
                a = int(len(s2) / num)
                b = int(len(s2) / num) + 1
                for i in range(1, len(s2)+1, int(len(s2)/num)):
                    s1[a] = s2[i]
                    s1[b] = s2[i+1]
                    a += int(len(s1)/num)
                    b += int(len(s1)/num)
                for i in range(0, len(s1), int(len(s1)/num)):
                    b = s1[i:i + int(len(s1)/(num))]
                    data.append(b)
                for i in range(num+1):
                    if i!=0:
                        data1.append(data[0])
                        data1.append(data[i])
            datalist.append(data1)
        return datalist
    
    
    def save_contents(urlist):
        with open("./plant.txt",'w',encoding = 'utf-8') as f:
            n=0
            for i in urlist:
                for j in i:
                    line = ";".join(j).strip()
                    f.write(str(line))
                    f.write('\t')
                    n += 1
                    if n%2==0:
                        f.write('\n')
                #lines = "\t".join([";".join(j) for j in i]).strip()
                #f.write(lines)
    
    if __name__ == "__main__":
        datalist = getData()
        save_contents(datalist)
    

3.3 Scraping plant descriptions

    # -*- coding: utf-8 -*-
    """
    Created on Wed Jul 22 09:20:31 2020
    
    @author: JX
    """
    
    from bs4 import BeautifulSoup
    import re
    import urllib.request,urllib.error
    
    plantid = []
    plant_file = open('./test.txt', 'r', encoding='utf-8')
    for line in plant_file.readlines():
        line = line.strip('\n')
        if line !='':
            plantid.append(line)
    
    def remove1(jieshao):
        jieshao = re.sub('<br/></br></blockquote>', "", str(jieshao))
        jieshao = re.sub('<blockquote.*?<br>', "", str(jieshao))
        jieshao = re.sub('<blockquote.*?>', "", str(jieshao))
        jieshao = re.sub('<blockquote.*?<p>', "", str(jieshao))
        jieshao = re.sub('<strong.*?</strong>', "", str(jieshao))
        jieshao = re.sub('<p.*?>', "", str(jieshao))
        jieshao = re.sub('</p>', "", str(jieshao))
        jieshao = re.sub('</blockquote>', "", str(jieshao))
        jieshao = re.sub('<br/>', "", str(jieshao))
        jieshao = re.sub('<span.*?>(\s+)?', "", str(jieshao))
        jieshao = re.sub('</span(\s+)?>(\s+)?', "", str(jieshao))
        jieshao = re.sub('<div.*?>', "", str(jieshao))
        jieshao = re.sub('</div>', "", str(jieshao))
        jieshao = re.sub('<br>', "", str(jieshao))
        jieshao = re.sub('</br>', "", str(jieshao))
        jieshao = re.sub('<a></a>', "", str(jieshao))
        jieshao = re.sub('\s+', '', str(jieshao)).strip()
        return jieshao
    
    def askURL(url):
        head = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
                'Cookie':'Abp.Localization.CultureName=zh-CN; _tempuserid=-132410786458757023; ASP.NET_SessionId=yriedc1wg4rjus5ujo4cf3kw; __RequestVerificationToken=vmt6p0lZkJfMiYcD2fynEVcsNotQhSmkZRXfBvIbd5iJYqHXTKG5o6vBIpWEGbDa9GG852NyiEvi-0SmUMvytCMy7Dg1; Hm_lvt_e96d4d23e997677f26ac69b89fc71ec7=1596605046,1596688605; XSRF-TOKEN=2a7785UE6WhyURBYISJQHFzhMqcSY6JMFHtCwGQfg15GcjCsMgzEzsaC7fA2TMhGCHVdezeiYMJZ3yX7ASYSxGRbzJ41; Hm_lpvt_e96d4d23e997677f26ac69b89fc71ec7=1596690112',
        }
        request = urllib.request.Request(url,headers = head)
        html = ""
        try:
            response = urllib.request.urlopen(request)
            html = response.read().decode("utf-8")
        except urllib.error.URLError as e:
            if hasattr(e,"code"):
                print(e.code)
            if hasattr(e,"reason"):
                print(e.reason)
        return html
    #url="https://www.chem960.com/unpd/s475"
    datalist = []
    def save_contents(urlist):
        # write the description once per record; num is set in the loop below
        with open("./jieshao.txt", 'a+', encoding='utf-8') as f:
            lines = "".join(urlist).strip()
            for i in range(num):
                f.writelines(lines + '\n')
    num = 0                                  # records per plant; referenced globally by save_contents()
    for url in plantid:  # loop over plant pages
        data = []
        print('Page {}'.format(url))
        html = askURL(url)
        soup = BeautifulSoup(html, 'html.parser')
        result1 = soup.find_all('div', class_="casno")
        result1 = str(result1)
        finds2 = re.compile(r'<span>(.*?)</span>', re.S)
        s2 = re.findall(finds2, result1)
        if s2 != []:
            num = int(len(s2) / 3)
        print(num)
        jieshao = soup.find('blockquote', class_="layui-elem-quote maigin-top-5")
        if jieshao is None:                  # check before stripping tags, otherwise str(None) slips through
            data.append('')
        else:
            jieshao = remove1(jieshao)
            data.append(jieshao.strip())
        save_contents(data)

Project 4: Scraping a plant database

  1. cookie folder: methods tested for obtaining cookies (see the sketch after this list)

    cookie.py: requests method

    ip.py: IP proxy pool method

    selenium.py: selenium method

    session.py: session method

  2. plantname folder: fetching plant names

    Input file: MF.txt

    Scraper script: formdate.py

    Output file: plantname.txt

    Deduplication script: delete.py (sketched after this list)

  3. Plant-compound correspondence:

    plant-chem.py: plant-compound scraping code

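None of the cookie experiments are reproduced in this write-up. Below is a minimal sketch of the session approach tested in session.py, assuming the chemdb.sgst.cn search form used by formdate.py in section 4.1; the form fields mirror that code, while the MF value and everything else are illustrative rather than the project's actual cookie.py or session.py.

    import requests

    # Hypothetical session-based cookie handling: a requests.Session keeps the ASP
    # session cookie from the first GET and re-sends it on later POSTs, instead of
    # pasting a Cookie header by hand as the scripts below do.
    session = requests.Session()
    session.headers.update({'User-Agent': 'Mozilla/5.0'})

    session.get('http://chemdb.sgst.cn/ssdb/plant/plant_R2.asp')
    print(session.cookies.get_dict())                 # e.g. the ASPSESSIONID cookie

    resp = session.post('http://chemdb.sgst.cn/ssdb/plant/plant_R2.asp',
                        data={'MF': 'C9H8O4', 'flag': '0', 'Page': '198'})
    resp.encoding = 'GBK'
    print(resp.status_code)

Likewise, delete.py is only listed above; a sketch of what a line-level deduplication of plantname.txt might look like (file names assumed from the list, not taken from the actual script):

    # Hypothetical deduplication: keep the first occurrence of each line.
    seen = set()
    with open('plantname.txt', encoding='utf-8') as src, \
            open('plantname_dedup.txt', 'w', encoding='utf-8') as dst:
        for line in src:
            if line not in seen:
                seen.add(line)
                dst.write(line)
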
4.1 Scraping plant names

    import requests
    from bs4 import BeautifulSoup
    import re
    import urllib.request
    import time
    import json
    import http.cookiejar as cj
    from itertools import islice
    
    def getMF():
        MFlist = []
        MF_file = open('./MF1.txt', 'r', encoding='utf-8')   # molecular-formula list (MF.txt in the manual)
        for line in MF_file.readlines():
            line = line.strip('\n')
            if line !='':
                MFlist.append(line)
        return MFlist
    # turn the str()-formatted regex matches into tab-separated fields
    def removes(dd):
        dd = re.sub("; \('", "\t", dd)
        dd = re.sub('&amp;', "&", dd)
        dd = re.sub('"', "", dd)
        dd = re.sub("'\)", "\t", dd)
        dd = re.sub("', '", "sss666", dd)
        dd = re.sub("sss666", "\t", dd)
        dd = re.sub("',", "", dd)
        return dd
    n = 1    # number of formulas processed so far (used to resume MF1.txt via islice)
    cw = 1   # error-page counter; after 10 pages containing "404", dump the rest to MF_out.txt and stop
    
    def search(MF):
    
        url = 'http://chemdb.sgst.cn/ssdb/plant/plant_R2.asp'
        #formdata = {'tid':'238620', 'm_PlantID':'3','pageno':'99'}
        jiansuo = {'MF':MF, 'flag':'0','Page':'198'}
        head = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
                    'Cookie':'ASPSESSIONIDSQSCACAD=NIPBNOPACHLJKHFPHGLMDFPK',
                    'Origin': 'http://chemdb.sgst.cn',
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                    'Accept-Encoding': 'gzip, deflate',
                    'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8,en-US;q=0.7,en;q=0.6',
                    'Cache-Control': 'max-age=0',
                    'Connection': 'keep-alive',
                    'Content-Length': '86',
                    'Content-Type': 'application/x-www-form-urlencoded',
            }
        try:
            r = requests.post(url, data=jiansuo,headers= head)
            r.raise_for_status()
            r.encoding = "GBK"
            print('Request OK')
    
            return r.text
        except requests.exceptions.RequestException as e:   # requests raises its own errors, not urllib's
            print(e)
    
    for MF in getMF():
        finds0 = re.compile(r'<font color="red">(.*?)</font>',re.S)
        finds1 = re.compile(r'<a href="plant_descript.asp(.*?)>(.*?)</a>',re.S)
        data=[]
        time.sleep(3)
        s = search(MF)
        soup = BeautifulSoup(s, 'html.parser')
        fcuo = re.compile(r'404', re.S)
        cuowu = re.findall(fcuo, str(s))
        if cuowu != []:
            cw += 1
        if cw == 10:
            path = './MF1.txt'
            path_out = './MF_out.txt'
            with open(path) as f, open(path_out, 'w+') as f_out:
                for a in islice(f, n, 100000, 1):
                    f_out.write(a)
            cw=1
            break
        print(cw)
        #print('第{}條'.format(n))
        n+=1
        table = soup.find('div')
        table = str(table)
        s0 = re.findall(finds0,table)
        s1 = re.findall(finds1,table)
        if len(s1) != 0:
            for i in range(len(s1)):
                with open("./form.txt",'a+',encoding = 'utf-8') as f:
                    dd = s0[1] + str(s1[i])
                    dd = removes(dd)
                    f.writelines(dd + '\n')
        else:
            with open("./form.txt", 'a+', encoding='utf-8') as f:
                f.writelines(str(MF) + '\t'+'None'+'\n')

4.2 Scraping plant-compound data

    # -*- coding: utf-8 -*-
    """
    Created on Thu Aug 20 10:27:44 2020
    
    @author: JX
    """
    import time
    import requests
    from bs4 import BeautifulSoup
    import re
    import random
    
    # scrape one plant's description page and append its fields to plant.txt
    def plant():
        url = 'http://chemdb.sgst.cn/ssdb/plant/Species_des.asp'
        formdata = {'m_PlantID':'1', 'tid':'242263'}
        head = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3775.400 QQBrowser/10.6.4208.400',
                    'Cookie':'ASPSESSIONIDQQTADDAD=LLNHIEEDONIDKJOIHOFPNEEH; __jsluid_h=dada7ae00f096479097a121016cd042c',
                    'Host': 'www.organchem.csdb.cn',
                    'Upgrade-Insecure-Requests':'1',
                    'Referer':'http://www.organchem.csdb.cn/scdb/plant/plant_R.asp',
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                    'Accept-Encoding': 'gzip, deflate',
                    'Accept-Language': 'zh-CN,zh;q=0.9',
                    'Cache-Control': 'max-age=0',
                    'Connection': 'keep-alive',
                    'Content-Length': '86',
                    'Content-Type': 'application/x-www-form-urlencoded',
            }
        try:
            r = requests.post(url, data=formdata, headers=head)
            time.sleep(random.random() * 3)
            r.raise_for_status()
            r.encoding = "GBK"
            print('Request OK')
        except Exception as e:
            print('Request failed:', e)
            return                           # r is undefined if the request failed
         
        data = []
        kong = []
        #fp = open(url,'r',encoding='GBK')
        soup = BeautifulSoup(r.text,'html.parser')
        finds0 = re.compile(r'<a href="(.*?)">.*?</a>', re.S)
        name = soup.find('td', class_='content_project3')
        with open("./plant.txt", 'a+', encoding='utf-8') as f:
            if name!=None:
                name = name.get_text()
                name = re.sub('\s+', '', str(name)).strip()
                f.writelines("物種: "+name +'\n')
            xuename = soup.find('td', class_='content_project1')
            if xuename!=None:
                xuename = xuename.get_text()
                xuename = re.sub('\s+', '', str(xuename)).strip()
                f.writelines("學名: "+xuename +'\n')
            miaoshu = soup.select('table[class="table2"] td[class="content_project1"] p')
            if len(miaoshu):
                for c in miaoshu:
                    #print("植物描述: "+c.text.strip())
                    f.writelines("植物描述: "+c.text.strip() +'\n')
            else:
                title = soup.select('td[class="title_project1"]')
                find = re.compile(r'植物描述', re.S)
                title = re.findall(find, str(title))
                if '植物描述' in title:
                    zi  = soup.select('table[class="table2"] td[class="content_project1"]')
                    finds1 = re.compile(r'<td class="content_project1">.*?<td class="content_project1">(.*?)</td>.*?', re.S)
                    miaoshu1 = re.findall(finds1, str(zi))
                    for c in miaoshu1:
                        f.writelines("植物描述: "+c.strip() +'\n')
                        #print("植物描述: "+c.strip())
                else:
                    print("植物描述空")
            fenlei = soup.select('a[href*="_des.asp"]')
            for c in fenlei:
                f.writelines("植物分類資訊: "+ c.text.strip() +'\n')
                #print("植物分類資訊: "+ c.text.strip())
            
            species = soup.select('a[href*="Species_compounds"]')
            species = re.findall(finds0, str(species))
            species = re.sub('&amp;', "&", str(species))
            #print("化學成分: "+species)
            f.writelines("化學成分: "+species +'\n')
            
            finds3 = re.compile(r'<td class="content_project1" title="植物的.*?">(.*?)</td>.*?', re.S)
            disease = re.findall(finds3, str(soup))
            for d in disease:
                f.writelines("可能有效疾病: "+ d.strip() +'\n')
                #print("可能有效疾病: "+ d.strip())
            
            image = soup.select('div[class="jqzoom"] img')
            finds4 = re.compile(r'<img.*?jqimg="(.*?)".*?/>', re.S)
            image = re.findall(finds4, str(image))
            for i in image:
                f.writelines("植物圖片: "+ i.strip() +'\n')
                #print("植物圖片: "+ i.strip())
            
            plantlit = soup.select('a[href^="plant_lit"]')
            for c in plantlit:
                f.writelines("研究文獻: "+c.text.strip()+'\n')
                #print("研究文獻: "+c.text.strip())
             #stripped_strings
            cankao = soup.select("td:nth-of-type(25)")
            for c in cankao:
                f.writelines("命名參考: "+c.text.strip() +'\n')
                #print("命名參考: "+c.text.strip())
                
    plant()
    '''  
    def getpage():
        url = 'http://www.organchem.csdb.cn/scdb/plant/Species_compounds.asp'  
        formdata = {'m_PlantID':'1', 'tid':'242236'}
        head = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
                    'Cookie':'ASPSESSIONIDSSRBABDC=MEAGDGECDGEALGILLHLAwwwDOHB; Hm_lvt_2d9981563033253e57169917f0e2a4d5=1597824270; Hm_lpvt_2d9981563033253e57169917f0e2a4d5=1597824270; ASPSESSIONIDSQTCCDAD=NJJPDIHCEMOLBEKNOOFABFAM',
                    'Host': 'www.organchem.csdb.cn',
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                    'Accept-Encoding': 'gzip, deflate',
                    'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8,en-US;q=0.7,en;q=0.6',
                    'Cache-Control': 'max-age=0',
                    'Connection': 'keep-alive',
                    'Content-Length': '86',
                    'Content-Type': 'application/x-www-form-urlencoded',
            }
        try:
            r = requests.post(url, data=formdata,headers= head)
            r.raise_for_status()
            r.encoding = "GBK"
        except:
            print('連結失敗')
        
        soup = BeautifulSoup(r.text,'html.parser')
        findpage = re.compile(r'當前頁碼:1/(.*?),10條紀錄/頁', re.S)
        page = re.findall(findpage, str(soup))
        page = int(page[0])
    '''
    # fetch the plant's compound list page by page, then the property table for each compound
    def srn():
        url = 'http://chemdb.sgst.cn/ssdb/plant/Species_compounds.asp?'
        for pn in range(1,66):
            print(pn)    
            formdata = {'m_PlantID':'1', 'tid':'242263','pageno':pn}
            head = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3775.400 QQBrowser/10.6.4208.400',
                    'Cookie':'ASPSESSIONIDQQTADDAD=LLNHIEEDONIDKJOIHOFPNEEH; __jsluid_h=dada7ae00f096479097a121016cd042c',
                    'Host': 'www.organchem.csdb.cn',
                    'Upgrade-Insecure-Requests':'1',
                    'Referer':'http://www.organchem.csdb.cn/scdb/plant/plant_R.asp',
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                    'Accept-Encoding': 'gzip, deflate',
                    'Accept-Language': 'zh-CN,zh;q=0.9',
                    'Cache-Control': 'max-age=0',
                    'Connection': 'keep-alive',
                    'Content-Length': '86',
                    'Content-Type': 'application/x-www-form-urlencoded',
            }
            try:
                r = requests.post(url, data=formdata, headers=head)
                time.sleep(random.random() * 3)
                r.raise_for_status()
                r.encoding = "GBK"
                print('Request OK')
            except Exception as e:
                print('Request failed:', e)
                continue                     # skip this page if the request failed
            
            soup = BeautifulSoup(r.text,'html.parser')
            srn = soup.select('a[href*="str_cas_property.asp"]')
            
            name = soup.find('td', class_='content_project1')
            if name!=None:
                name = name.get_text()
                name = re.sub('\s+', '', str(name)).strip()
            for s in srn:
                print(name+": "+s.text.strip())
                num = s.text.strip()
                url1 = 'http://chemdb.sgst.cn/ssdb/str/str_cas_property.asp'
                formdata1 = {'srn':num}
                head1 = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3775.400 QQBrowser/10.6.4208.400',
                    'Cookie':'ASPSESSIONIDQQTADDAD=LLNHIEEDONIDKJOIHOFPNEEH; __jsluid_h=dada7ae00f096479097a121016cd042c',
                    'Host': 'www.organchem.csdb.cn',
                    'Upgrade-Insecure-Requests':'1',
                    'Referer':'http://www.organchem.csdb.cn/scdb/plant/plant_R.asp',
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                    'Accept-Encoding': 'gzip, deflate',
                    'Accept-Language': 'zh-CN,zh;q=0.9',
                    'Cache-Control': 'max-age=0',
                    'Connection': 'keep-alive',
                    'Content-Length': '86',
                    'Content-Type': 'application/x-www-form-urlencoded',
                }
                try:
                    r = requests.post(url1, headers=head1, data=formdata1)
                    time.sleep(random.random() * 3)
                    r.raise_for_status()
                    r.encoding = "GBK"
                except Exception as e:
                    print('Request failed:', e)
                    continue                 # skip this compound if the request failed
                soup = BeautifulSoup(r.text,'html.parser')
                rowline = [row.get_text(strip=True) for index, row in enumerate(soup.find_all("tr"), start=1) if index <=4] #index % 2 == 0])
                with open("./chemtest.txt", 'a+', encoding='utf-8') as f:
                            f.writelines(name+": "+s.text.strip() + '\t' +str(rowline) +'\n')
    
    
    srn()
