實戰-快手H5字型反爬
前言
快手 H5 端的粉絲數使用了字型反爬:直接抓到的 HTML 文字是亂碼,例如 <SPAN STYLE='FONT-FAMILY: kwaiFont;'></SPAN>(span 內原本是自訂字型的私有區字元,無法直接顯示)。
可以看到對應的字型名稱為 kwaiFont。
經過一番分析,發現每次返回的 ttf 檔案內容都不太一樣,無法事先做一份固定的對映模板,那麼就不做模板了。可以通過 OCR 或者 KNN 進行內容識別。本人採用 OCR 方式進行識別。這裡推薦一個很好用的 OCR 庫:ddddocr。
流程分析
- 找到對應ttf檔案
- 分析ttf檔案,將每個字型轉換成圖片
- 圖片識別成文字
- 亂碼對映
直接上程式碼
import re
import ddddocr
import requests
from lxml import etree
from io import BytesIO
from fontTools.ttLib import TTFont
from fontTools.pens.basePen import BasePen
from reportlab.graphics.shapes import Path
from reportlab.lib import colors
from reportlab.graphics import renderPM
from reportlab.graphics.shapes import Group, Drawing
class ReportLabPen(BasePen):
    """fontTools pen that records a glyph outline into a ReportLab ``Path``.

    The accumulated outline is available as ``self.path`` once a glyph has
    been drawn with this pen.
    """

    def __init__(self, glyph_set, path=None):
        """Bind the pen to *glyph_set* and draw into *path* (new ``Path`` if None)."""
        super().__init__(glyph_set)
        self.path = Path() if path is None else path

    def _moveTo(self, p):
        """Start a new sub-path at point *p*."""
        px, py = p
        self.path.moveTo(px, py)

    def _lineTo(self, p):
        """Add a straight segment ending at point *p*."""
        px, py = p
        self.path.lineTo(px, py)

    def _curveToOne(self, p1, p2, p3):
        """Add one cubic curve with control points *p1*, *p2* ending at *p3*."""
        (c1x, c1y), (c2x, c2y), (end_x, end_y) = p1, p2, p3
        self.path.curveTo(c1x, c1y, c2x, c2y, end_x, end_y)

    def _closePath(self):
        """Close the current sub-path."""
        self.path.closePath()
class KuaiShouSpider(object):
    """Scraper for the follower/following counts on a Kuaishou H5 profile page.

    The page renders those numbers with an obfuscated web font whose glyph
    mapping changes between responses, so each response's TTF is downloaded,
    every glyph is rasterised and recognised with OCR, and the resulting
    code-point -> character table is used to decode the scraped text.
    """

    def __init__(self):
        # OCR engine used to recognise the rendered glyph images.
        self.ocr = ddddocr.DdddOcr()

    def ttf_2_word_map(self, ttf_content, fmt="png"):
        """Build a ``hex code point -> recognised character`` map from a TTF.

        :param ttf_content: raw bytes of the TTF file.
        :param fmt: image format passed to ``renderPM.drawToString``.
        :return: dict whose keys are ``hex()`` strings of the code points in
            the font's cmap (e.g. ``'0xe641'``) and whose values are the OCR
            result for the corresponding glyph.
        """
        font = TTFont(BytesIO(ttf_content))
        glyph_set = font.getGlyphSet()
        # getBestCmap() returns None when the font has no Unicode cmap.
        best_cmap = font.getBestCmap() or {}
        # Invert the cmap: glyph name -> hex code point string.
        key_map = {glyph_name: hex(code) for code, glyph_name in best_cmap.items()}

        # OCR tends to confuse these look-alike symbols; normalise them.
        ocr_fixes = {'十': '+', ',': '.', '。': '.'}
        width = height = 800

        data_dict = dict()
        for glyph_name in font.getGlyphNames():
            # Skip special glyphs such as '.notdef' and '.null'.
            if glyph_name.startswith('.'):
                continue
            # Glyphs without a cmap entry can never appear in page text, so
            # there is nothing to map them to.
            key = key_map.get(glyph_name)
            if key is None:
                continue
            pen = ReportLabPen(glyph_set, Path(fillColor=colors.black, strokeWidth=5))
            glyph_set[glyph_name].draw(pen)
            drawing = Drawing(width, height)
            drawing.add(Group(pen.path))
            img = renderPM.drawToString(drawing, fmt)
            data = self.ocr.classification(img)
            data_dict[key] = ocr_fixes.get(data, data)
        return data_dict

    @staticmethod
    def uni_code_2_word(uni_code, word_map):
        """Replace numeric character references with their decoded characters.

        Both decimal (``&#58945;``) and hexadecimal (``&#xe641;``) references
        are recognised. A reference whose code point is a key of *word_map*
        is replaced by the mapped value; any other reference is replaced by
        the character it denotes, so ordinary non-ASCII text that was
        serialised as a reference survives instead of raising ``KeyError``.

        :param uni_code: text containing ``&#...;`` references.
        :param word_map: dict keyed by ``hex(code_point)`` strings.
        :return: the decoded text.
        """
        def _sub(match):
            ref = match.group(1)
            if ref[0] in 'xX':
                code_point = int(ref[1:], 16)
            else:
                code_point = int(ref)
            key = hex(code_point)
            if key in word_map:
                return word_map[key]
            try:
                return chr(code_point)
            except (ValueError, OverflowError):
                # Not a valid code point: leave the reference untouched.
                return match.group()

        return re.sub(r'&#([xX][0-9a-fA-F]+|\d+);', _sub, uni_code)

    def _read_count(self, html, xpath_expr, word_map):
        """Return the decoded text of the first node matching *xpath_expr*, or None."""
        nodes = html.xpath(xpath_expr)
        if not nodes:
            return None
        # tostring() emits non-ASCII characters as &#N; references, which is
        # the form uni_code_2_word() decodes.
        markup = etree.tostring(nodes[0]).decode('utf-8')
        found = re.search('>(.+?)<', markup)
        if found is None:
            return None
        return self.uni_code_2_word(found.group(1), word_map)

    def get_user_info(self, url=None):
        """Fetch a profile page and print its follower and following counts.

        :param url: profile page URL; defaults to the sample share link below.
        :return: ``(fans, focus)`` on success, ``None`` when the page could
            not be parsed (a message is printed in that case).
        """
        if url is None:
            # The original literal contained '0×tamp=': '&times' had been
            # decoded as an HTML entity. Restored to '&timestamp='.
            url = 'https://c.kuaishou.com/fw/user/ounixiong?fid=0&cc=share_copylink&followRefer=151&shareMethod=TOKEN&kpn=KUAISHOU&subBiz=PROFILE&shareId=16509009682073&shareToken=X-7IIolIHVVgN2bx&shareResourceType=PROFILE_OTHER&shareMode=APP&originShareId=16509009682073&appType=21&shareObjectId=136457866&shareUrlOpened=0&timestamp=1633759010452&captchaToken=HEADCgp6dC5jYXB0Y2hhEscCX569ztU1Y9XCAVp1Q5Rsm1H8fPYfPZBHvTyg5mwPyIQrJSR_j2mphorguzP9cB2sNWhg61OwW_LQEBvnHRS47j0GpmjIBOeqJ9j9kIbNTsXgNSQYZxkdToAm25EKa4ZLXOmE9ez5Bl-UMzRs4P2_g6SzI3fBs1yFvI7_eLd_yFogwimBE5eyopG9qDDm5lFPfSPm0GI6IhqLKpA1VBZd9cjZxsxq4jGlld1vYRxOFyfJis4oFSVM8fpDArN32KQ2pqejgjV8kK42jW-kpg4fl-1g5iWmqSczszEvEdB9s4l3QmQBfztuDSPbGf0yfY-whf93nOynaRmSeLH49sHSaPr_nwcGvjNjqeFdZoTpf2VBLV7mWvkVdthG0yV5Y6BqDPWSr57Js-dvLIcYlyq3gLbNxQOsulNch6o-HQ7dw2CZY006z-_eGhLniyxQb2WiE0ZVkCv0UGAb2gsoBTACTAIL'
        # NOTE(review): the Cookie below is a captured session and has
        # presumably expired; replace it with a fresh one before running.
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Cookie': 'did=web_232e842d3bcd4eceb358abfcf31ec030; didv=1634614098000; sid=e7921611a1cbb9669d28ce19; Hm_lvt_86a27b7db2c5c0ae37fee4a8a35033ee=1634614100; Hm_lpvt_86a27b7db2c5c0ae37fee4a8a35033ee=1634614104',
            'DNT': '1',
            'Host': 'c.kuaishou.com',
            'Pragma': 'no-cache',
            'Referer': 'https://c.kuaishou.com/fw/user/ounixiong?fid=0&cc=share_copylink&followRefer=151&shareMethod=TOKEN&kpn=KUAISHOU&subBiz=PROFILE&shareId=16509009682073&shareToken=X-7IIolIHVVgN2bx&shareResourceType=PROFILE_OTHER&shareMode=APP&originShareId=16509009682073&appType=21&shareObjectId=136457866&shareUrlOpened=0&timestamp=1633759010452',
            'sec-ch-ua': '"Chromium";v="94", "Google Chrome";v="94", ";Not A Brand";v="99"',
            'sec-ch-ua-mobile': '?1',
            'sec-ch-ua-platform': '"Android"',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'same-origin',
            'Sec-Fetch-User': '?1',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Mobile Safari/537.36',
        }
        # A timeout keeps a stalled connection from hanging the script forever.
        response = requests.get(url, headers=headers, timeout=10)

        # Locate the obfuscated TTF referenced by the page's CSS.
        ttf_match = re.search(r'url\((https:.+?\.ttf)\)', response.text)
        if ttf_match is None:
            print('網頁訪問異常')
            return None
        ttf_data = requests.get(ttf_match.group(1), timeout=10)
        ttf_word = self.ttf_2_word_map(ttf_data.content)

        # The count sits in the <span> immediately before its label.
        # NOTE(review): the live page presumably uses Simplified Chinese
        # labels, so both spellings are accepted - confirm against the page.
        html = etree.HTML(response.text)
        fans = self._read_count(
            html,
            '//span[contains(text(),"粉絲") or contains(text(),"粉丝")]/preceding-sibling::span[1]',
            ttf_word,
        )
        focus = self._read_count(
            html,
            '//span[contains(text(),"關注") or contains(text(),"关注")]/preceding-sibling::span[1]',
            ttf_word,
        )
        if fans is None or focus is None:
            print('網頁訪問異常')
            return None
        print(fans)
        print(focus)
        return fans, focus
if __name__ == '__main__':
    # Script entry point: build the spider and run one profile scrape.
    KuaiShouSpider().get_user_info()
後記
可以考慮一下用 KNN的方式根據字型特徵進行分類,準備好一些樣本,進行訓練.