# -*- coding:utf-8 -*-
# Online tool for inspecting font files: https://font.qqe2.com/index-en.html
"""
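Task:
Handle the font file first.
The font file that accompanies the current request is obtained from that request's response.
Every crawler run must work with the latest data: issue a fresh request, then fetch the latest font file.
Hard-coded request parameters expire:
1. the timestamp
2. index (how is it produced?)
3. signkey
   the result of MD5 hashing
Every time the page data changes (i.e. a request is made):
1. a new request appears each time, and a signkey is generated for that request
2. if `signKey: f` is the code that generates the signkey:
   set a breakpoint at that position
   the browser executes `signKey: f` on its next request
   when execution reaches this code
   the value after signkey is f
   so what is f?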
"""
import io
import re
import json
import time
import random
import hashlib
# pip install ddddocr
import ddddocr
import requests
# pip install pillow==9.4.0
from PIL import Image, ImageDraw, ImageFont
from urllib.parse import urlencode
# pip install fonttools
from fontTools.ttLib import TTFont
class Movie_Data(object):
    """Fetch the Maoyan box-office dashboard and decode its obfuscated digits.

    The dashboard response carries numbers as HTML entities (``&#x....;``)
    that only render correctly with a web font shipped in the same response.
    This class downloads that font, renders every glyph to an image, runs
    OCR on each image and substitutes the recognised text back into the
    response before printing movie names and box-office figures.
    """

    # Seconds to wait on each HTTP request before giving up.
    REQUEST_TIMEOUT = 10
    # Local path the downloaded web font is written to and re-read from.
    FONT_PATH = "movie.woff"
    # Base64 of the User-Agent header below. It is sent as the "User-Agent"
    # query parameter and is also part of the signed string, so both uses
    # must stay identical.
    _UA_PARAM = "TW96aWxsYS81LjAgKFdpbmRvd3MgTlQgMTAuMDsgV2luNjQ7IHg2NCkgQXBwbGVXZWJLaXQvNTM3LjM2IChLSFRNTCwgbGlrZSBHZWNrbykgQ2hyb21lLzEyNC4wLjAuMCBTYWZhcmkvNTM3LjM2IEVkZy8xMjQuMC4wLjA="
    # Matches one hexadecimal HTML character entity, e.g. "&#xe893;".
    _ENTITY_RE = re.compile(r"&#x([0-9a-fA-F]+);")

    def __init__(self):
        """Create the OCR engine and pre-compute the signed request values."""
        self.ocr = ddddocr.DdddOcr()
        self.url = "https://piaofang.maoyan.com/dashboard-ajax?"
        # Millisecond timestamp and a random index in 1..1000; both are part
        # of the string that gets signed.
        self.timestamp = int(time.time() * 1000)
        self.index = int(1000 * random.random() + 1)
        self.content = (
            f"method=GET&timeStamp={self.timestamp}"
            f"&User-Agent={self._UA_PARAM}"
            f"&index={self.index}&channelId=40009&sVersion=2"
            "&key=A013F70DB97834C0A5492378BD76C53A"
        )
        # signKey is the hex MD5 digest of the string above.
        self.signkey = hashlib.md5(self.content.encode()).hexdigest()
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0"
        }

    def parse_data_index(self):
        """Request the dashboard endpoint and return the response body as text.

        Raises:
            requests.RequestException: on network failure, timeout or a
                non-2xx HTTP status.
        """
        params = {
            "orderType": "0",
            "uuid": "18affa452e4c8-057e2dc1cfbe0c-78505771-384000-18affa452e55",
            "timeStamp": self.timestamp,
            "User-Agent": self._UA_PARAM,
            "index": self.index,
            "channelId": "40009",
            "sVersion": "2",
            "signKey": self.signkey,
        }
        response = requests.get(
            self.url + urlencode(params),
            headers=self.headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        # Fail loudly instead of handing an error page to the JSON parser.
        response.raise_for_status()
        return response.text

    def get_font_index(self, response):
        """Download the web font named in *response* and list its glyph names.

        Args:
            response: dashboard response text (JSON) containing a
                ``fontStyle`` CSS snippet.

        Returns:
            The font's glyph order with the first two entries dropped.

        Raises:
            ValueError: if no font URL can be found in ``fontStyle``.
            requests.RequestException: if the font download fails.
        """
        font_style = json.loads(response)["fontStyle"]
        match = re.search(r'opentype"\),url\("(.*?)"\);}', font_style)
        if match is None:
            raise ValueError("no font URL found in the fontStyle field")
        font_url = match.group(1)
        # The URL is normally scheme-less; only add one when it is missing.
        if not font_url.startswith(("http://", "https://")):
            font_url = "https:" + font_url
        resp = requests.get(font_url, timeout=self.REQUEST_TIMEOUT)
        resp.raise_for_status()
        with open(self.FONT_PATH, "wb") as file:
            file.write(resp.content)
        tfont = TTFont(self.FONT_PATH)
        try:
            # NOTE(review): the first two glyphs are skipped, presumably
            # because they are placeholders (e.g. ".notdef") - confirm.
            return tfont.getGlyphOrder()[2:]
        finally:
            # Release the file handle held by fontTools.
            tfont.close()

    def parse_font_index(self, font_list):
        """Render each glyph to an image and return the OCR result per glyph.

        Args:
            font_list: glyph names whose characters after the first three
                are a hexadecimal code point (e.g. ``"uniE893"``).

        Returns:
            A list of OCR results, in the same order as *font_list*.

        Raises:
            ValueError: if a glyph name does not end in a hexadecimal
                code point.
        """
        # Load the font file written by get_font_index.
        font = ImageFont.truetype(self.FONT_PATH, 40)
        charlist = []
        for glyph_name in font_list:
            # chr(int(...)) also handles code points longer than four hex
            # digits, which a "\uXXXX" escape would split in two.
            char = chr(int(glyph_name[3:], 16))
            image = Image.new(mode="RGB", size=(42, 40), color="white")
            ImageDraw.Draw(im=image).text(xy=(0, 0), text=char, fill=0, font=font)
            buffer = io.BytesIO()
            image.save(buffer, format="JPEG")
            charlist.append(self.ocr.classification(buffer.getvalue()))
        return charlist

    def font_replace(self, response, old_font_list, new_font_list):
        """Decode the obfuscated entities in *response* and print each movie.

        Args:
            response: dashboard response text (JSON).
            old_font_list: glyph names such as ``"uniE893"``.
            new_font_list: recognised text for each glyph, same order.
        """
        # Key by glyph name, not by OCR text: two glyphs recognised as the
        # same text must both stay decodable.
        glyph_to_text = {
            glyph.lower(): str(text)
            for glyph, text in zip(old_font_list, new_font_list)
        }

        def decode(match):
            # Unknown entities are left untouched rather than mangled.
            return glyph_to_text.get("uni" + match.group(1).lower(), match.group(0))

        # Only the entities themselves are rewritten, so semicolons that
        # belong to ordinary text (e.g. movie titles) survive.
        resp = self._ENTITY_RE.sub(decode, response)
        data_list = json.loads(resp)["movieList"]["data"]["list"]
        for data in data_list:
            title = data["movieInfo"]["movieName"]
            price = data["boxSplitUnit"]["num"]
            print(f"電影名稱---{title}, 電影票房---{price}")

    def main(self):
        """Run the pipeline: fetch data, fetch font, OCR glyphs, print results."""
        response = self.parse_data_index()
        old_font_list = self.get_font_index(response)
        new_font_list = self.parse_font_index(old_font_list)
        self.font_replace(
            response=response,
            old_font_list=old_font_list,
            new_font_list=new_font_list,
        )
# Script entry point: build the scraper and run the full pipeline once.
if __name__ == "__main__":
    Movie_Data().main()