import json
import random
import re
import subprocess
from functools import partial
from faker import Faker
from parsel import Selector
# Shared Faker instance (zh_CN locale); the scraper functions below call
# fake.user_agent() to get a User-Agent string for each request.
fake = Faker(locale='zh_CN')
# Replace subprocess.Popen with a partial that defaults to encoding='utf-8'.
# This is done before execjs is imported below.
# NOTE(review): presumably so the output of the JS runtime subprocess is
# decoded as UTF-8 instead of the platform default -- confirm.
subprocess.Popen = partial(subprocess.Popen, encoding='utf-8')
import execjs
import requests
def read_js(js_data, funcs_name, var=None, var2=None):
    """Compile JavaScript source text and call one of its functions.

    Args:
        js_data: JavaScript source code as a string.
        funcs_name: Name of the JavaScript function to invoke.
        var: First argument forwarded to the JavaScript function.
        var2: Second argument forwarded to the JavaScript function.

    Returns:
        Whatever the execjs call returns for ``funcs_name(var, var2)``.
    """
    context = execjs.compile(js_data)
    # Both arguments are always forwarded, even when left at None.
    return context.call(funcs_name, var, var2)
def read_jsfile(js_file, funcs_name, var=None, var2=None):
    """Load a JavaScript file, compile it and call one of its functions.

    Args:
        js_file: Path of the JavaScript file to read.
        funcs_name: Name of the JavaScript function to invoke.
        var: First argument forwarded to the JavaScript function.
        var2: Second argument forwarded to the JavaScript function.

    Returns:
        Whatever the execjs call returns for ``funcs_name(var, var2)``.
    """
    # The file is read in text mode with the platform default encoding.
    with open(js_file, 'r') as source_file:
        script = source_file.read()
    # Both arguments are always forwarded, even when left at None.
    return execjs.compile(script).call(funcs_name, var, var2)
def re_method(str, one_data, last_data):
    """Return every shortest run of text found between two delimiters.

    The delimiters are inserted into the pattern verbatim, so they are
    interpreted as regular-expression fragments, not as literal text.
    Matches do not overlap: the closing delimiter of one match cannot also
    serve as the opening delimiter of the next.

    Args:
        str: Text to search (the name shadows the builtin but is part of
            the keyword interface used by callers).
        one_data: Regex fragment that must precede the captured text.
        last_data: Regex fragment that must follow the captured text.

    Returns:
        A list with the captured text of every match, in order.
    """
    # Example: text 'HelloHsadoHazzzoHo' with 'H' / 'o' uses pattern 'H(.*?)o'.
    pattern = "{}(.*?){}".format(one_data, last_data)
    return re.findall(pattern, str)
def re_method2(str, par_data):
    """Return all non-overlapping matches of a regex pattern in a text.

    Args:
        str: Text to search (the name shadows the builtin but is part of
            the keyword interface).
        par_data: Regular-expression pattern, e.g. ``'H(.*?)o'``.

    Returns:
        The list produced by ``findall`` for the pattern: captured groups
        when the pattern has groups, whole matches otherwise.
    """
    return re.findall(par_data, str)
def ip_proxies():
    """Load the proxy addresses stored in ``ip.txt``.

    Each non-blank line of the file is one entry; surrounding whitespace is
    stripped. Every line is returned, including the first one, blank lines
    are skipped instead of ending the read, and no empty string is appended
    at end of file.

    Returns:
        A list of the non-empty, stripped lines in file order (empty list
        for an empty file).

    Raises:
        OSError: If ``ip.txt`` cannot be opened.
    """
    file_path = 'ip.txt'
    with open(file_path, 'r') as file:
        stripped_lines = (line.strip() for line in file)
        return [entry for entry in stripped_lines if entry]
def get_api_iplist():
    """Download a batch of proxy addresses from the 89ip.cn API.

    The response is treated as a sequence of entries separated by ``<br>``
    tags. Every segment that has a ``<br>`` on both sides is returned.
    A ``<br>(.*?)<br>`` findall is not used because regex matches cannot
    overlap: the closing ``<br>`` of one entry would be consumed and the
    following entry skipped, losing every second address.

    Returns:
        A list of the stripped, non-empty segments found between
        consecutive ``<br>`` tags.

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the server does not answer within the timeout.
    """
    f_ua = fake.user_agent()
    url = "http://api.89ip.cn/tqdl.html?api=1&num=600&port=&address=&isp="
    headers = {'User-Agent': f_ua}
    # A timeout keeps the script from hanging forever on a stalled server.
    html = requests.get(url=url, headers=headers, timeout=30).text
    # Drop the text before the first and after the last <br>: only segments
    # enclosed by tags on both sides are entries.
    enclosed_segments = html.split("<br>")[1:-1]
    return [segment.strip() for segment in enclosed_segments if segment.strip()]
# ——————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————
# Base URL of the target site; get_one_paga_data() prepends it to the hrefs
# scraped from a listing page.
Url = "http://hello.luck01.pro/"
# Scrape one listing (first-level) page.
def get_one_paga_data(url):
    """Fetch a listing page and extract titles, cover URLs and item URLs.

    The response embeds its content as a string assigned to ``var str``.
    That string is passed to function ``f`` of ``sese_web.js`` (run through
    execjs), and the result is parsed as HTML.

    Args:
        url: URL of the listing page to download.

    Returns:
        A dict with three lists:
        ``titlelist`` (``title`` attribute of each item link),
        ``srclist`` (``data-original`` attribute of each item image) and
        ``url2list`` (item URL with ``.html`` and anything after it
        replaced by ``/sid/1/nid/1.html``).

    Raises:
        IndexError: If the response contains no ``var str = '...'`` payload,
            or if ``ip.txt`` yields no proxy entries.
        requests.exceptions.RequestException: On connection failure or when
            the server does not answer within the timeout.
    """
    headers = {
        "Host": "hello.luck01.pro",
        "Referer": "http://hello.luck01.pro/",
        "User-Agent": fake.user_agent(),
    }
    # NOTE(review): requests selects a proxy by the scheme of the requested
    # URL, so this 'https' entry is not applied when `url` is an http:// URL
    # (as the listing URLs built elsewhere in this file are). Behaviour is
    # kept as-is; confirm whether the proxy is meant to be used.
    proxies = {
        'https': 'https://' + random.choice(ip_proxies())
    }
    # A timeout keeps the scraper from hanging forever on a stalled server.
    resp = requests.get(url=url, headers=headers, proxies=proxies, timeout=30).text

    # Pull the embedded payload out of the page and decode it with the
    # site's own JavaScript routine.
    encoded = re_method(str=resp, one_data="var str = '", last_data="'")[0]
    html = read_jsfile(js_file="sese_web.js", funcs_name="f", var=encoded)

    sel = Selector(html)
    hrefs = sel.xpath('//li[@class="content-item"]/a/@href').getall()
    titlelist = sel.xpath('//li[@class="content-item"]/a/@title').getall()
    srclist = sel.xpath('//li[@class="content-item"]/a/img/@data-original').getall()

    url1list = [(Url + href) for href in hrefs]
    # The final-page URL can be derived directly from the item URL.
    url2list = [(url1.split(".html")[0] + "/sid/1/nid/1.html") for url1 in url1list]
    return {
        "titlelist": titlelist,
        "srclist": srclist,
        "url2list": url2list,
    }
# Scrape the final (play) pages and append the collected records to file.json.
def get_last_paga_data_and_download(data, num, num_max):
    """Resolve the m3u8 URL of every item and append the records to file.json.

    For each URL in ``data["url2list"]`` the function replaces ``detail``
    with ``play``, downloads that page, decodes its embedded ``var str``
    payload with function ``f`` of ``sese_web.js``, extracts the first
    m3u8 URL, downloads that playlist and builds the second-level playlist
    URL on ``https://vodvip888.com``. Titles, covers and m3u8 URLs are then
    zipped together and each record is appended to ``file.json`` as a JSON
    object, followed by a comma while the running count is below
    ``num_max``.

    Args:
        data: Dict with the lists ``titlelist``, ``srclist`` and
            ``url2list``.
        num: Number of records written before this call.
        num_max: Total number of records expected; no comma is written
            after the record that brings the count up to this value.

    Returns:
        The updated running count (``num`` plus the number of records
        written). Integers are passed by value, so this return value is the
        only way for a caller to continue the count on the next call.

    Raises:
        IndexError: If a page lacks the expected payload / m3u8 markers, or
            if ``ip.txt`` yields no proxy entries.
        requests.exceptions.RequestException: On connection failure or when
            a server does not answer within the timeout.
    """
    titles = data["titlelist"]
    covers = data["srclist"]
    play_page_urls = data["url2list"]

    # Read ip.txt once instead of once per URL, and only when there is
    # something to fetch.
    proxy_pool = ip_proxies() if play_page_urls else []

    m3u8_list = []
    for detail_url in play_page_urls:
        last_url = detail_url.replace("detail", "play")
        headers = {
            "Host": "hello.luck01.pro",
            "Referer": "http://hello.luck01.pro/",
            "User-Agent": fake.user_agent(),
        }
        ip_random = random.choice(proxy_pool)
        # NOTE(review): requests selects a proxy by the scheme of the
        # requested URL. `proxies_s` only has an 'https' entry and is used
        # for the page request, `proxies` only has an 'http' entry and is
        # used for the m3u8 request; whenever the URL scheme differs from
        # the dict key the proxy is not applied. Behaviour is kept as-is;
        # confirm the intent.
        proxies_s = {
            'https': 'https://' + ip_random
        }
        proxies = {
            'http': 'http://' + ip_random
        }
        # Timeouts keep the scraper from hanging forever on a stalled server.
        resp = requests.get(url=last_url, headers=headers, proxies=proxies_s, timeout=30).text

        # Decode the embedded payload by running the site's JavaScript
        # routine through execjs.
        encoded = re_method(str=resp, one_data="var str = '", last_data="'")[0]
        js_to_html = read_jsfile(js_file="sese_web.js", funcs_name="f", var=encoded)

        # The first m3u8 URL is extracted with a regex; the backslashes it
        # contains are removed to get a usable URL.
        fm3u8url1 = re_method(js_to_html, '"url":"', '","url_next":')[0]
        m3u8url1 = fm3u8url1.replace("\\", "")
        m3u8 = requests.get(url=m3u8url1, proxies=proxies, timeout=30).text

        # The first playlist points to a second one via a path that is
        # joined to the fixed host below.
        m3u8_2url = "https://vodvip888.com" + re_method2(m3u8, "#EXT-X-STREAM-INF.*?\\n(.*?).m3u8")[0] + ".m3u8"
        m3u8_list.append(m3u8_2url)

    records = list(zip(titles, covers, m3u8_list))
    if not records:
        return num

    # Open the output once for the whole batch instead of once per record.
    with open('file.json', "a", encoding="utf-8") as f:
        for title_, src_, m3u8_url in records:
            items = {
                'title': title_,
                'cover': src_,
                'm3u8_url': m3u8_url,
            }
            f.write(json.dumps(items, ensure_ascii=False))
            num += 1
            print(items)
            print(num)
            if num < num_max:
                f.write(",")
    return num
import pymysql
def download_to_json(page_count=1, page_size=16):
    """Scrape listing pages 1..page_count and store the records in file.json.

    ``file.json`` is rewritten from scratch as a JSON array. The records
    themselves are appended by ``get_last_paga_data_and_download``. Once all
    pages are done, a dangling comma left after the last record is removed
    before the closing bracket is written, so the file stays valid JSON
    even when a page holds fewer than ``page_size`` items or the record
    counter does not carry over between pages.

    Args:
        page_count: Number of listing pages to scrape, starting at page 1.
            The default of 1 matches the previous hard-coded behaviour.
        page_size: Expected number of items per listing page; used only to
            compute the expected total passed on as ``num_max``.
    """
    num = 0
    num_max = page_count * page_size
    with open('file.json', "w", encoding="utf-8") as f:
        f.write("[")

    for page in range(1, page_count + 1):
        url = f"http://hello.luck01.pro/index.php/vod/type/id/13/page/{page}.html"
        written = get_last_paga_data_and_download(data=get_one_paga_data(url=url), num_max=num_max, num=num)
        # Carry the running count over to the next page when the helper
        # reports one; otherwise keep the current value.
        if written is not None:
            num = written

    # Close the array, dropping a trailing separator if one was left behind.
    with open('file.json', "r+", encoding="utf-8") as f:
        content = f.read().rstrip()
        if content.endswith(","):
            content = content[:-1]
        f.seek(0)
        f.write(content + "]")
        f.truncate()
def write_mysql_data(host, user, passwd, db, table_name):
    """Insert every record of file.json into a MySQL table.

    ``file.json`` must contain a JSON array of objects with the keys
    ``title``, ``cover`` and ``m3u8_url``. All rows are inserted first and
    committed together afterwards; the connection is closed even when an
    insert fails.

    Args:
        host: MySQL server host.
        user: MySQL user name.
        passwd: MySQL password.
        db: Name of the database to use.
        table_name: Target table. It is interpolated into the SQL text
            (identifiers cannot be bound as query parameters), so it must
            come from trusted code, never from external input.

    Raises:
        OSError: If ``file.json`` cannot be opened.
        json.JSONDecodeError: If ``file.json`` is not valid JSON.
        KeyError: If a record lacks one of the expected keys.
    """
    # Read the JSON file through a context manager so the handle is closed.
    with open("file.json", encoding="utf-8") as json_file:
        data_dict_list = json.load(json_file)

    # Local name differs from the `db` parameter so the database name is
    # not overwritten by the connection object.
    conn = pymysql.connect(host=host, user=user, passwd=passwd, db=db)
    try:
        cursor = conn.cursor()
        sql = f"INSERT INTO {table_name}(title,cover,m3u8_url) values (%s,%s,%s)"
        for data_dict in data_dict_list:
            values = (data_dict["title"], data_dict["cover"], data_dict["m3u8_url"])
            # Row values are bound as parameters, not formatted into the SQL.
            cursor.execute(sql, values)
        conn.commit()
    finally:
        conn.close()
def create_mysql_dbtable(table_name, host="127.0.0.1", user="root", password="123456", db="spider"):
    """Create the scraper's result table if it does not exist yet.

    The table has an auto-increment ``id`` primary key and three
    ``varchar(255) not null`` columns: ``title``, ``cover`` and
    ``m3u8_url``. No explicit commit is issued after the statement. The
    connection is closed even when the statement fails.

    Args:
        table_name: Name of the table to create. It is interpolated into
            the SQL text, so it must come from trusted code, never from
            external input.
        host: MySQL server host.
        user: MySQL user name.
        password: MySQL password.
        db: Name of the database to use.

    The connection defaults reproduce the values that used to be
    hard-coded here; pass real credentials instead of relying on them.
    """
    conn = pymysql.connect(
        host=host,
        user=user,
        password=password,
        db=db
        # port = 3306,  # database port number
    )
    try:
        cursor = conn.cursor()
        sql = f"""
        create table if not exists {table_name} (
            id INT AUTO_INCREMENT PRIMARY KEY,
            title varchar(255) not null,
            cover varchar(255) not null,
            m3u8_url varchar(255) not null
        )
        """
        cursor.execute(sql)
    finally:
        conn.close()
def json_file_to_mysqldb():
    """Create the ``vip_data`` table if needed, then load file.json into it."""
    target_table = "vip_data"
    connection_args = {
        "host": "127.0.0.1",
        "user": "root",
        "passwd": "123456",
        "db": "spider",
    }
    # Make sure the destination table exists before inserting.
    create_mysql_dbtable(target_table)
    # Write the records of the generated JSON file into that table.
    write_mysql_data(table_name=target_table, **connection_args)
def main():
    """Script entry point: scrape the site and write the records to file.json."""
    download_to_json()
    # Loading file.json into MySQL is currently disabled:
    # json_file_to_mysqldb()


if __name__ == "__main__":
    main()