簡單例子展示爬蟲在不同思想下的寫法

專注的阿熊發表於2021-04-26

1. 普通寫法

from urllib import parse

from urllib import request

# --- Plain-script version of the Tieba crawler ---
# Asks for a topic and a page range, downloads each result page of the
# Baidu Tieba forum search, and saves every page's HTML to its own file.

name = input(' 選擇您要檢視的主題: ')

start = int(input(' 選擇起始頁: '))

end = int(input(' 選擇結束頁: '))

# Browser-like User-Agent so the server serves the normal page instead
# of rejecting an obviously automated client.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko)"
                  " Chrome/90.0.4430.85 Safari/537.36 Edg/90.0.818.46"
}

# FIX: the original line was truncated to "base_url = '" (an unterminated
# string literal — a syntax error).  Restored the Tieba search URL that
# the '&ie=utf-8&pn=' query parameters below imply.
base_url = 'https://tieba.baidu.com/f?kw='

for i in range(start, end + 1):
    # Tieba paginates 50 posts per page; 'pn' is the post offset.
    num = (i - 1) * 50
    url = base_url + parse.quote(name) + '&ie=utf-8&pn=' + str(num)
    req = request.Request(url, headers=headers)
    response = request.urlopen(req)
    html = response.read().decode('utf-8')
    file_name = ' ' + str(i) + ' 頁內容 .html'
    with open(file_name, 'w', encoding='utf-8') as file_obj:
        print(' 正在爬取第 %d ' % i)
        file_obj.write(html)

2. 函式式寫法

from urllib import parse

from urllib import request

# 獲取資料

def read_url(url):
    """Download *url* and return the response body decoded as UTF-8.

    Sends a desktop-browser User-Agent header so the server does not
    reject the automated request.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 "
                      "(KHTML, like Gecko)"
                      " Chrome/90.0.4430.85 Safari/537.36 Edg/90.0.818.46"
    }
    req = request.Request(url, headers=headers)
    # FIX: close the HTTP response when done — the original never
    # closed it, leaking the underlying socket.
    with request.urlopen(req) as response:
        return response.read().decode('utf-8')

# 寫入資料

def write_page(file_name, html):
    """Persist *html* into *file_name* as UTF-8 text and confirm on stdout."""
    with open(file_name, mode='w', encoding='utf-8') as fh:
        fh.write(html)
        print(" 寫入成功 ")

# 主函式,其他的都寫入其中

def main():
    """Drive the crawl: read user choices, then fetch and save each page.

    Delegates downloading to read_url() and persistence to write_page().
    """
    # NOTE: the original prompt had an advertising string injected into
    # it; restored to match the identical prompts used elsewhere in
    # this file.
    name = input(' 選擇您要檢視的主題: ')
    start = int(input(' 選擇起始頁: '))
    end = int(input(' 選擇結束頁: '))
    # FIX: the original line was truncated to "base_url = '" (an
    # unterminated string literal).  Restored the Tieba search URL that
    # the '&ie=utf-8&pn=' query parameters below imply.
    base_url = 'https://tieba.baidu.com/f?kw='
    for i in range(start, end + 1):
        # Tieba paginates 50 posts per page; 'pn' is the post offset.
        num = (i - 1) * 50
        url = base_url + parse.quote(name) + '&ie=utf-8&pn=' + str(num)
        file_name = ' ' + str(i) + ' 頁內容 .html'
        html = read_url(url)
        write_page(file_name, html)

# Script entry point: run the crawler only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':

     main()

3. 物件導向寫法

from urllib import parse

from urllib import request

class BaiduSpider:
    """Object-oriented version of the Tieba crawler.

    Holds the request headers and the search base URL as instance
    state; main() drives the prompt / fetch / save loop.
    """

    def __init__(self):
        # Browser-like User-Agent so the server serves the normal page.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko)"
                          " Chrome/90.0.4430.85 Safari/537.36 Edg/90.0.818.46"
        }
        # FIX: the original line was truncated to "self.base_url = '"
        # (an unterminated string literal).  Restored the Tieba search
        # URL that the '&ie=utf-8&pn=' query parameters imply.
        self.base_url = 'https://tieba.baidu.com/f?kw='

    def read_page(self, url):
        """Download *url* and return the response body decoded as UTF-8."""
        req = request.Request(url, headers=self.headers)
        # FIX: close the HTTP response when done — the original never
        # closed it, leaking the underlying socket.
        with request.urlopen(req) as response:
            return response.read().decode('utf-8')

    def write_page(self, file_name, html):
        """Persist *html* into *file_name* as UTF-8 text and confirm on stdout."""
        with open(file_name, 'w', encoding='utf-8') as file_obj:
            file_obj.write(html)
            print(" 寫入成功 ")

    def main(self):
        """Prompt for a topic and page range, then fetch and save each page."""
        name = input(' 選擇您要檢視的主題: ')
        start = int(input(' 選擇起始頁: '))
        end = int(input(' 選擇結束頁: '))
        for i in range(start, end + 1):
            # Tieba paginates 50 posts per page; 'pn' is the post offset.
            num = (i - 1) * 50
            url = self.base_url + parse.quote(name) + '&ie=utf-8&pn=' + str(num)
            file_name = ' ' + str(i) + ' 頁內容 .html'
            html = self.read_page(url)
            self.write_page(file_name, html)

# Script entry point: build a spider instance and run it only when this
# file is executed directly, not when it is imported as a module.
if __name__ == '__main__':

     yes = BaiduSpider()

     yes.main()


來自 “ ITPUB部落格 ” ,連結:http://blog.itpub.net/69946337/viewspace-2769972/,如需轉載,請註明出處,否則將追究法律責任。

相關文章