python簡書資料抓取

One of them發表於2018-08-25

使用Python抓取簡書首頁標題及詳情頁資訊

"""
   get page_data of 'JianShu.com'.(rewrite)
"""
import re
from lxml import etree
import requests
from bs4 import BeautifulSoup
import json


class PageTo:
    """Scrape jianshu.com: fetch the front page, paginate through the
    infinite-scroll AJAX endpoint, and print each article's title,
    abstract, author, engagement stats and body paragraphs.
    """

    def __init__(self, url):
        """Fetch the first page, capture the CSRF token and the note ids
        already shown (both required by the AJAX pagination), then parse
        page 1 immediately.

        :param url: site root, e.g. 'https://www.jianshu.com'
        """
        self.url = url
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
        }

        # timeout so a stalled connection cannot hang the scraper forever
        self.main_page = requests.get(self.url, headers=self.header, timeout=10)
        # CSRF token embedded in a <meta> tag; echoed back on AJAX requests
        self.csrf_token = re.findall('<meta name="csrf-token" content="(.*?)" />', self.main_page.text)[0]
        # ids of notes already displayed; sent back so the server returns new ones
        self.note_id = re.findall('<li id=".*?" data-note-id="(.*?)"', self.main_page.text, re.S)
        print("第1頁")
        self.page_parse(self.main_page.text)

    def get_ajax_page(self, start_page, end_page):
        """Fetch pages start_page..end_page (inclusive) through the
        infinite-scroll AJAX endpoint and parse each one.

        :param start_page: first page number to fetch (the front page is 1)
        :param end_page: last page number to fetch, inclusive
        """
        ajax_header = self.header.copy()
        ajax_header.update({
            'X-INFINITESCROLL': 'true',
            'X-CSRF-Token': self.csrf_token
        })

        # BUG FIX: the old `if page < end_page + 1` guard was always true
        # inside range(start_page, end_page + 1), so the else/break branch
        # ("It's over.") was dead code; the loop itself already terminates
        # at end_page, and the completion message now prints afterwards.
        for page in range(start_page, end_page + 1):
            print(f"第{page}頁")
            params = {'seen_snote_ids[]': self.note_id}
            ajax_page = requests.get(self.url, params=params, headers=ajax_header, timeout=10)
            self.page_parse(ajax_page.text)
            # BUG FIX: the original pattern 'data-note-id="(.*?' was missing
            # the closing quote, so the lazy group always matched the empty
            # string and no new note ids were ever accumulated.
            self.note_id += re.findall('data-note-id="(.*?)"', ajax_page.text, re.S)
        print("It's over.")

    def page_parse(self, page):
        """Extract (link, title) pairs and abstracts from a listing page,
        print them, then fetch and parse each article's detail page.

        :param page: raw HTML text of a listing page
        """
        article_title_link = re.findall('<a class="title" target="_blank" href="(.*?)">(.*?)</a>', page, re.S)
        article_abstract = re.findall(' <p class="abstract">(.*?)</p>', page, re.S)

        article_header = self.header.copy()
        article_header.update({
            'Upgrade-Insecure-Requests': '1'
        })
        # zip() pairs the two findall lists safely: the old index loop
        # raised IndexError whenever they differed in length.
        for (link, title), abstract in zip(article_title_link, article_abstract):
            print(title)
            print(abstract)

            # article links are site-relative paths (e.g. '/p/...'),
            # so they are appended to the site root
            article_page = requests.get(f'{self.url}{link}', headers=article_header, timeout=10).text
            self.article_parse(article_page)

    def article_parse(self, article_page):
        """Print the author, engagement stats and paragraph text of one
        article detail page.

        :param article_page: raw HTML text of an article page
        """
        soup = BeautifulSoup(article_page, 'lxml')
        article_info = soup.find('div', class_='article')
        author = article_info.find('div', class_='author').find('span', class_='name').get_text()
        # page state (likes, views, wordage, comments) ships as JSON inside
        # a <script type="application/json"> tag
        author_info = json.loads(soup.find('script', type='application/json').get_text())
        print('author:', author)
        print(f'"likes_count":{author_info["note"]["likes_count"]},'
              f'"views_count":{author_info["note"]["views_count"]},'
              f'"public_wordage":{author_info["note"]["public_wordage"]},'
              f'"comments_count":{author_info["note"]["comments_count"]}')
        for paragraph in article_info.find_all('p'):
            print(paragraph.get_text())
        print("\n")


if __name__ == '__main__':
    # Crawl the jianshu.com front page, then AJAX pages 2 through 3.
    scraper = PageTo('https://www.jianshu.com')
    scraper.get_ajax_page(2, 3)

相關文章