北京市政百姓信件分析實戰一 (利用python爬取資料)

财神给你送元宝發表於2024-09-06

因為我的python版本為3.12

所以安裝一些軟體包命令 與之前有些許不同

pip install beautifulsoup4

pip install demjson3

pip install requests

話不多說 程式碼奉上

import json

import demjson3
import requests
from bs4 import BeautifulSoup
import csv

# Browser-like headers for the beijing.gov.cn AJAX letter-list endpoint.
# NOTE: the original hard-coded 'Content-Length': '155', but the actual POST
# body is the 2-byte string "{}".  Sending a wrong Content-Length can make the
# server misread or reject the request; `requests` computes the correct value
# automatically, so that header is deliberately omitted here.
headers = {
    'Host': 'www.beijing.gov.cn',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Accept-Encoding': 'gzip, deflate',
    'Content-Type': 'text/json',
    'X-Requested-With': 'XMLHttpRequest',
    'Origin': 'http://www.beijing.gov.cn',
    'Connection': 'keep-alive',
    'Referer': 'http://www.beijing.gov.cn/hudong/hdjl/'
}

def _div_text(soup, css_class):
    """Return the text of the first <div> whose class attribute is *css_class*, or ''."""
    # Using the class_ keyword avoids the original's comma-for-colon typo,
    # which passed a *set* {"class", "..."} instead of a dict to find_all.
    tag = soup.find("div", class_=css_class)
    return tag.get_text() if tag else ""


def _clean(text):
    """Strip surrounding whitespace and remove embedded CR/LF characters."""
    return text.strip().replace("\r", "").replace("\n", "")


def _labeled(text, label):
    """Strip *text* and drop a leading '<label>' prefix if present.

    The original used str.lstrip(label), which strips any of the label's
    *characters*, not the prefix itself; removeprefix (Python 3.9+) is the
    correct operation.
    """
    return text.strip().removeprefix(label).strip()


def _parse_detail(html):
    """Parse one letter detail page; return the 7 text fields as a tuple.

    Fields: title, sender, sent time, question body, answering office,
    answer time, answer body.  Missing elements yield ''.
    """
    soup = BeautifulSoup(html, "html.parser")
    strong = soup.find("strong")
    title = strong.get_text().replace("\n", "") if strong else ""
    from_people = _labeled(
        _div_text(soup, "col-xs-10 col-lg-3 col-sm-3 col-md-4 text-muted"), "來信人:")
    from_time = _labeled(
        _div_text(soup, "col-xs-5 col-lg-3 col-sm-3 col-md-3 text-muted"), "時間:")
    problem = _clean(
        _div_text(soup, "col-xs-12 col-md-12 column p-2 text-muted mx-2"))
    office = _div_text(
        soup, "col-xs-9 col-sm-7 col-md-5 o-font4 my-2").replace("\n", "")
    answer_time = _labeled(
        _div_text(soup, "col-xs-12 col-sm-3 col-md-3 my-2"), "答覆時間:")
    answer = _clean(
        _div_text(soup, "col-xs-12 col-md-12 column p-4 text-muted my-3"))
    return title, from_people, from_time, problem, office, answer_time, answer


if __name__ == "__main__":
    # Crawl pages 1..174 of the reply-letter listing, fetch each letter's
    # detail page, and append one pipe-separated record per letter to
    # yijian.txt.
    payload = json.dumps({})
    # Open the output once for the whole run instead of once per record.
    with open("yijian.txt", "a", encoding="utf-8") as fp:
        for page in range(1, 175):
            print(page)
            url = (
                "https://www.beijing.gov.cn/hudong/hdjl/sindex/"
                "bjah-index-hdjl!replyLetterListJson.action"
                f"?page.pageNo={page}&page.pageSize=6&orgtitleLength=26"
            )
            r = requests.post(url, data=payload, headers=headers)
            listing = demjson3.decode(r.text)

            for item in listing.get("result", []):
                original_id = item.get("originalId")      # letter id
                letter_type = item.get("letterTypeName")  # letter category
                # "諮詢" (consult) pages live under a different module/action
                # than suggestion pages.  NOTE(review): "suggesDetail" is kept
                # verbatim from the original URL — presumably the site really
                # spells it that way; confirm before "fixing" it.
                is_consult = letter_type == "諮詢"
                module = "consult" if is_consult else "suggest"
                action = "consultDetail" if is_consult else "suggesDetail"
                detail_url = (
                    "http://www.beijing.gov.cn/hudong/hdjl/"
                    f"com.web.{module}.{action}.flow?originalId={original_id}"
                )

                r1 = requests.get(detail_url, headers={'user-agent': 'Mozilla/5.0'})
                if r1.status_code != 200:
                    print(f"Failed to retrieve details for ID: {original_id}")
                    continue

                fields = _parse_detail(r1.text)
                record = "|".join((str(original_id), str(letter_type), *fields))
                fp.write(record + "\n")

  

相關文章