feapder框架爬取ks評論_遞迴的方式

我爱你的 發表於 2024-06-07
import random
import re
import time
from feapder.db.mysqldb import MysqlDB
import feapder

# Optional sign, digits with an optional decimal point, optional exponent.
# Compiled once at import time; used with fullmatch so no anchors are needed.
_NUMBER_RE = re.compile(r'[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?')


def is_number(string):
    """Return True if *string* is a plain decimal number.

    Accepted text forms are an optional sign, digits with an optional
    decimal point, and an optional exponent (e.g. "12", "-1.5", ".5",
    "3e10"). The whole value must match, so a trailing newline is rejected.

    Args:
        string: Value to test. Usually a str; int/float values count as
            numbers, while None, bool and any other type do not.

    Returns:
        bool: True when the value is numeric, False otherwise.
    """
    # bool is a subclass of int, so rule it out before the numeric check.
    if isinstance(string, bool):
        return False
    if isinstance(string, (int, float)):
        return True
    # None or any other non-text value cannot be matched by the regex.
    if not isinstance(string, str):
        return False
    return _NUMBER_RE.fullmatch(string) is not None

class AirSpiderDemo(feapder.AirSpider):
    """Crawl the root comments of one Kuaishou video, page by page.

    The first request asks the GraphQL endpoint for the first comment page
    (empty ``pcursor``). ``parse`` prints the comment texts and, while the
    returned ``pcursor`` is numeric, yields a request for the next page with
    itself as callback.
    """

    # ID of the video whose comments are crawled; also used for the Referer.
    photoId = "3x2nh3ssaispdie"

    # Database handle. Only referenced by the commented-out add_batch_smart
    # call in parse().
    db = MysqlDB()

    _GRAPHQL_URL = "https://www.kuaishou.com/graphql"

    # GraphQL document sent with every comment-list request.
    _COMMENT_LIST_QUERY = "query commentListQuery($photoId: String, $pcursor: String) {\n visionCommentList(photoId: $photoId, pcursor: $pcursor) {\n commentCount\n pcursor\n rootComments {\n commentId\n authorId\n authorName\n content\n headurl\n timestamp\n likedCount\n realLikedCount\n liked\n status\n authorLiked\n subCommentCount\n subCommentsPcursor\n subComments {\n commentId\n authorId\n authorName\n content\n headurl\n timestamp\n likedCount\n realLikedCount\n liked\n status\n authorLiked\n replyToUserName\n replyTo\n __typename\n }\n __typename\n }\n __typename\n }\n}\n"

    # Proxy pool; one entry is picked at random for every request.
    # NOTE(review): every entry only defines an 'http' proxy while the target
    # URL is https. If the downloader selects proxies by URL scheme (as the
    # requests library does), none of these is actually used - confirm.
    _PROXY_POOL = [
        {'http': '23.226.117.85:8080'}, {'http': '117.69.233.218:8089'}, {'http': '212.127.93.185:8081'},
        {'http': '45.12.30.112:80'}, {'http': '190.82.91.205:999'}, {'http': '94.73.239.124:55443'},
        {'http': '144.255.49.43:9999'}, {'http': '125.99.34.94:8080'}, {'http': '183.164.242.66:8089'},
        {'http': '47.92.6.221:8089'}, {'http': '114.231.41.157:8888'}, {'http': '60.174.1.93:8089'},
        {'http': '103.143.197.19:8080'}, {'http': '194.182.163.117:3128'}, {'http': '117.69.236.113:8089'},
        {'http': '190.107.236.169:999'}, {'http': '104.129.198.41:8800'}, {'http': '119.93.145.82:3128'},
        {'http': '152.66.208.22:80'}, {'http': '190.111.214.234:8181'}, {'http': '104.129.198.217:8800'},
        {'http': '31.197.253.254:48678'}, {'http': '85.62.218.250:3128'}, {'http': '112.243.88.8:9000'},
        {'http': '46.40.6.201:7777'}, {'http': '103.48.68.36:83'}, {'http': '179.1.110.230:8080'},
        {'http': '181.31.225.234:3128'}, {'http': '41.33.66.237:1976'}, {'http': '183.164.243.43:8089'},
        {'http': '137.74.65.101:80'}, {'http': '106.75.86.143:1080'}, {'http': '121.41.87.128:80'},
        {'http': '118.27.33.17:8118'}, {'http': '36.6.144.34:8089'}, {'http': '91.243.192.17:3128'},
        {'http': '112.53.184.170:9091'}, {'http': '119.252.171.50:8080'}, {'http': '193.41.88.58:53281'},
        {'http': '202.80.43.204:8080'}, {'http': '183.166.137.201:41122'}, {'http': '36.6.145.73:8089'},
        {'http': '117.57.93.94:8089'}, {'http': '212.174.242.114:8080'}, {'http': '36.6.145.224:8089'},
        {'http': '183.164.242.146:8089'}, {'http': '92.249.122.108:61778'}, {'http': '88.255.102.37:8080'},
        {'http': '91.226.92.7:80'}, {'http': '115.236.55.186:10100'}, {'http': '198.50.237.23:80'},
        {'http': '104.18.24.139:80'}, {'http': '172.67.181.107:80'}, {'http': '92.249.113.194:55443'},
        {'http': '141.95.241.100:80'}, {'http': '63.239.220.109:8080'}, {'http': '41.220.104.65:8080'},
        {'http': '20.219.180.149:3129'}, {'http': '198.211.117.231:80'}, {'http': '117.69.232.127:8089'},
        {'http': '91.226.92.19:80'}, {'http': '105.112.95.133:8080'}, {'http': '117.71.155.108:8089'},
        {'http': '187.60.219.4:3128'}, {'http': '117.71.149.100:8089'}, {'http': '117.71.133.79:8089'},
    ]

    def _comment_request(self, pcursor):
        """Build the POST request for the comment page identified by *pcursor*.

        Args:
            pcursor: Page cursor; the empty string requests the first page.

        Returns:
            feapder.Request: Request handled by ``parse`` and decorated by
            ``_midware`` before download.
        """
        payload = {
            "operationName": "commentListQuery",
            "variables": {
                "photoId": self.photoId,
                "pcursor": pcursor,
            },
            "query": self._COMMENT_LIST_QUERY,
        }
        return feapder.Request(
            self._GRAPHQL_URL,
            json=payload,
            method="POST",
            callback=self.parse,
            download_midware=self._midware,
        )

    def start_requests(self):
        """Yield the request for the first comment page."""
        yield self._comment_request("")

    def _midware(self, request):
        """Attach a random proxy, browser-like headers and cookies.

        Args:
            request: The outgoing feapder request; modified in place.

        Returns:
            The same request object.
        """
        request.proxies = random.choice(self._PROXY_POOL)
        request.headers = {
            "Origin": "https://www.kuaishou.com",
            # Point the Referer at the page of the video actually being
            # crawled instead of a hard-coded, unrelated video id.
            "Referer": "https://www.kuaishou.com/short-video/" + self.photoId,
            "Sec-Fetch-Site": "same-origin",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
        }
        request.cookies = {
            "did": "web_544f660eef074162a0abfbc3bfabbba8",
            "didv": "1717656796000",
            "kpf": "PC_WEB",
            "clientid": "3",
            "kpn": "KUAISHOU_VISION"
        }
        return request

    def parse(self, request, response):
        """Print one page of comments and schedule the next page, if any.

        Args:
            request: The request that produced *response* (unused).
            response: Downloaded response; its JSON body must contain
                ``data.visionCommentList``.

        Yields:
            feapder.Request: Request for the next page while the returned
            ``pcursor`` is a numeric string.
        """
        print(response)
        comment_list = response.json['data']['visionCommentList']
        pcursor_id = comment_list['pcursor']
        print(pcursor_id)

        # A page without comments may carry null instead of a list.
        root_comments = comment_list['rootComments'] or []
        contentlist = [{'content': comment['content']} for comment in root_comments]
        print(contentlist)

        # To persist the comments, create the matching table in your own
        # database first, then enable the batch insert below
        # ("ks_table" is the table name):
        # self.db.add_batch_smart("ks_table", contentlist)

        # Pause for 1-2 seconds so the crawl is less likely to be detected
        # as a bot.
        time.sleep(round(random.uniform(1.0, 2.0), 1))

        # Keep paging only while the cursor is a numeric string; any other
        # value (or a missing cursor) ends the crawl.
        if isinstance(pcursor_id, str) and is_number(pcursor_id):
            yield self._comment_request(pcursor_id)
        else:
            print("pcursor_id沒有了")




if __name__ == "__main__":
    # Script entry point: run the spider with thread_count=1.
    AirSpiderDemo(thread_count=1).start()

相關文章