# Scrape Jianshu (jianshu.com) home-page article titles and their detail-page information with Python
"""
Fetch page data from 'JianShu.com' (rewrite).
"""
import re
from lxml import etree
import requests
from bs4 import BeautifulSoup
import json
class PageTo:
    """Crawl a Jianshu-style listing page and print every article it links to.

    Constructing an instance immediately downloads ``url``, extracts the CSRF
    token and the note ids present in the page, and prints/parses that first
    page. :meth:`get_ajax_page` then requests further batches from the same
    URL, sending the ``X-INFINITESCROLL`` header and the ids already seen.

    All results are written to standard output; nothing is returned.
    """

    # Seconds to wait for a single HTTP response. Without a timeout
    # ``requests.get`` may block forever on a stalled connection.
    REQUEST_TIMEOUT = 10

    # Patterns are compiled once here instead of on every page.
    _CSRF_RE = re.compile(r'<meta name="csrf-token" content="(.*?)" />')
    _MAIN_NOTE_ID_RE = re.compile(r'<li id=".*?" data-note-id="(.*?)"', re.S)
    # The closing quote is required: without it the lazy group matches the
    # empty string and every "seen" id would be recorded as ''.
    _AJAX_NOTE_ID_RE = re.compile(r'data-note-id="(.*?)"', re.S)
    _TITLE_LINK_RE = re.compile(
        r'<a class="title" target="_blank" href="(.*?)">(.*?)</a>', re.S)
    _ABSTRACT_RE = re.compile(r' <p class="abstract">(.*?)</p>', re.S)

    def __init__(self, url):
        """Download ``url`` and print/parse its first page of articles.

        Args:
            url: Base URL of the site. Article hrefs found in the page are
                appended to it verbatim to build the detail-page URLs.

        Raises:
            IndexError: If the page contains no ``csrf-token`` meta tag.
        """
        self.url = url
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
        }
        self.main_page = requests.get(
            self.url, headers=self.header, timeout=self.REQUEST_TIMEOUT)
        html = self.main_page.text

        tokens = self._CSRF_RE.findall(html)
        if not tokens:
            # Same exception type the bare ``[0]`` lookup used to raise, but
            # with a message that says what is actually missing.
            raise IndexError(f'no csrf-token meta tag found at {self.url}')
        self.csrf_token = tokens[0]

        # Ids of the notes already displayed; sent back on follow-up requests.
        self.note_id = self._MAIN_NOTE_ID_RE.findall(html)

        print("第1頁")
        self.page_parse(html)

    @classmethod
    def _note_ids_from_fragment(cls, html):
        """Return every ``data-note-id`` attribute value found in ``html``."""
        return cls._AJAX_NOTE_ID_RE.findall(html)

    def get_ajax_page(self, start_page, end_page):
        """Fetch and parse the follow-up pages ``start_page``..``end_page``.

        Both bounds are inclusive. After each batch the note ids it contained
        are appended to ``self.note_id`` so the next request reports them as
        already seen. Prints ``It's over.`` once the last page is processed.

        Args:
            start_page: Number printed for the first follow-up page.
            end_page: Number printed for the last follow-up page.
        """
        ajax_header = {
            **self.header,
            'X-INFINITESCROLL': 'true',
            'X-CSRF-Token': self.csrf_token,
        }
        # NOTE(review): the page number is only used for the printed label;
        # it is not sent to the server. Confirm whether the endpoint also
        # expects a ``page`` query parameter.
        for page in range(start_page, end_page + 1):
            print(f"第{page}頁")
            params = {'seen_snote_ids[]': self.note_id}
            ajax_page = requests.get(
                self.url,
                params=params,
                headers=ajax_header,
                timeout=self.REQUEST_TIMEOUT,
            )
            self.page_parse(ajax_page.text)
            self.note_id += self._note_ids_from_fragment(ajax_page.text)
        print("It's over.")

    def page_parse(self, page):
        """Print title and abstract of each listed article, then its detail page.

        Titles and abstracts are paired in document order. If the page yields
        a different number of each, the unpaired leftovers are ignored.

        Args:
            page: HTML text of a listing page (or listing fragment).
        """
        title_links = self._TITLE_LINK_RE.findall(page)
        abstracts = self._ABSTRACT_RE.findall(page)
        article_header = {**self.header, 'Upgrade-Insecure-Requests': '1'}

        # zip() stops at the shorter list, so a page with more abstracts than
        # title links no longer fails with IndexError.
        for (href, title), abstract in zip(title_links, abstracts):
            print(title)
            print(abstract)
            article_page = requests.get(
                f'{self.url}{href}',
                headers=article_header,
                timeout=self.REQUEST_TIMEOUT,
            ).text
            self.article_parse(article_page)

    def article_parse(self, article_page):
        """Print author, counters and paragraph text of one article page.

        If the expected elements are not present the article is skipped with
        a short notice, so one odd page does not abort the whole crawl.

        Args:
            article_page: HTML text of an article detail page.
        """
        soup = BeautifulSoup(article_page, 'lxml')
        article_info = soup.find('div', class_='article')
        data_tag = soup.find('script', type='application/json')

        author_box = None
        if article_info is not None:
            author_box = article_info.find('div', class_='author')
        name_tag = None
        if author_box is not None:
            name_tag = author_box.find('span', class_='name')

        # find() returns None when nothing matches; compare against None
        # explicitly rather than relying on tag truthiness.
        if article_info is None or name_tag is None or data_tag is None:
            print('skipped: unexpected article page layout')
            print("\n")
            return

        author = name_tag.get_text()
        note = json.loads(data_tag.get_text())["note"]

        print('author:', author)
        print(f'"likes_count":{note["likes_count"]},'
              f'"views_count":{note["views_count"]},'
              f'"public_wordage":{note["public_wordage"]},'
              f'"comments_count":{note["comments_count"]}')
        for paragraph in article_info.find_all('p'):
            print(paragraph.get_text())
        # Blank separator between consecutive articles.
        print("\n")
if __name__ == '__main__':
    # The constructor fetches and prints page 1; then request pages 2 and 3.
    crawler = PageTo('https://www.jianshu.com')
    crawler.get_ajax_page(start_page=2, end_page=3)