多執行緒爬取B站視訊

learner_witt發表於2020-10-13
#threading.Thread()

import os
import re
import time
import datetime
import requests
import threading
from moviepy.editor import *


# Fetch the source addresses of the video and audio streams
def get_url(url):
	"""Return the direct media URLs embedded in a video page.

	Args:
		url: Address of the video page to download and scan.

	Returns:
		A ``(video_url, audio_url)`` tuple: the first ``baseUrl`` value that
		follows ``duration`` in the page source, and the first one that
		follows ``audio``.

	Raises:
		ValueError: If either address cannot be found in the page source.
		requests.RequestException: If the request fails, times out, or the
			server answers with an HTTP error status.
	"""
	headers = {
		'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
	}
	# Use the caller's URL instead of rebuilding it from a global variable.
	# Repeated slashes in the path (e.g. from joining a base that ends in '/'
	# with '/video/...') are collapsed so such input still hits the page.
	scheme, sep, rest = url.partition('://')
	if sep:
		path, mark, query = rest.partition('?')
		url = scheme + sep + re.sub('/{2,}', '/', path) + mark + query
	response = requests.get(url=url, headers=headers, timeout=10)
	response.raise_for_status()
	page = response.text
	video_match = re.search('duration.*?baseUrl":"(.*?)"', page)
	audio_match = re.search('audio.*?baseUrl":"(.*?)"', page)
	if video_match is None or audio_match is None:
		# Fail with a readable message instead of AttributeError on None.
		raise ValueError('no video/audio baseUrl found in page: ' + url)
	return video_match.group(1), audio_match.group(1)


# Multithreaded download
def download(url_1, video_id, filename='2.mp4', chunk_size=5242880, max_threads=10):
	"""Download ``url_1`` into ``filename`` using several range requests.

	A small probe request reads the total size from the ``Content-Range``
	response header, the output file is pre-allocated to that size, and the
	byte span is split into contiguous ranges that ``Handler`` threads
	write into the file at their own offsets.

	Args:
		url_1: Direct address of the media stream.
		video_id: Video identifier, used to build the ``Referer`` header.
		filename: Output path (defaults to the original ``'2.mp4'``).
		chunk_size: Bytes of payload per thread before another thread is
			added (defaults to 5 MiB).
		max_threads: Upper bound on the number of worker threads.

	Raises:
		KeyError: If the server does not send a ``Content-Range`` header.
		ValueError: If the total size in ``Content-Range`` is not a number.
		requests.RequestException: If the probe request fails.
	"""
	url_2 = 'https://www.bilibili.com/video/' + video_id + '?from=search'
	# Probe with a short range only to learn the total size of the resource
	headers = {
		'Referer': url_2,
		'Range': 'bytes=0-10000',
		'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
	}
	probe = requests.get(url=url_1, headers=headers, timeout=10)
	probe.raise_for_status()
	# Content-Range looks like "bytes 0-10000/<total>"; take what follows the
	# slash rather than a fixed-width slice, which breaks for other prefixes.
	file_size = int(probe.headers['Content-Range'].rsplit('/', 1)[-1])
	print('video size:' + str(int(file_size / 1024 / 1024)) + "MB")
	if not file_size:
		# Nothing to fetch; workers would fail opening a file never created.
		return
	# Pre-allocate the file so every worker can seek to its own offset
	with open(filename, 'wb') as fp:
		fp.truncate(file_size)
	print('視訊大小:' + str(int(file_size / 1024 / 1024)) + "MB")

	all_thread = 1
	if file_size > chunk_size:
		all_thread = max(1, min(file_size // chunk_size, max_threads))
	part = file_size // all_thread

	threads = []
	for i in range(all_thread):
		# HTTP ranges are inclusive on both ends; the last worker also takes
		# the remainder left over by the integer division.
		start = part * i
		end = file_size - 1 if i == all_thread - 1 else start + part - 1
		thread_headers = headers.copy()
		thread_headers['Range'] = 'bytes=%s-%s' % (start, end)
		t = threading.Thread(
			target=Handler,
			name='th-' + str(i),
			kwargs={'start': start, 'end': end, 'url': url_1, 'filename': filename, 'headers': thread_headers},
			daemon=True,	# do not keep the process alive if the main thread exits
		)
		threads.append(t)
	for t in threads:
		# Stagger the starts slightly instead of opening all connections at once
		time.sleep(0.2)
		t.start()
	for t in threads:
		t.join()

def Handler(start, end, url, filename, headers=None):
	"""Stream one byte range of ``url`` into ``filename`` at offset ``start``.

	Args:
		start: File offset at which the received bytes are written.
		end: Last byte offset of the range (inclusive, as in an HTTP
			``Range`` header); used only for the progress display.
		url: Address to download from.
		filename: Existing file to write into; opened in ``'r+b'`` mode so
			its other contents are kept.
		headers: Request headers, expected to carry the matching ``Range``
			value. ``None`` sends no extra headers.

	Raises:
		requests.RequestException: If the request fails or the server
			answers with an HTTP error status.
		OSError: If ``filename`` cannot be opened for update.
	"""
	tt_name = threading.current_thread().name
	print(tt_name + 'is begin\t')
	total_size = end - start + 1
	downsize = 0
	# The context manager releases the streamed connection when done
	with requests.get(url, headers=headers, stream=True) as r:
		# Refuse to write an error page into the media file
		r.raise_for_status()
		startTime = time.time()
		with open(filename, 'r+b') as fp:
			fp.seek(start)
			for chunk in r.iter_content(204800):   # write to disk while downloading
				if not chunk:
					continue
				fp.write(chunk)
				downsize += len(chunk)
				# Guard against a zero interval on coarse clocks
				elapsed = max(time.time() - startTime, 1e-6)
				line = tt_name + '-downloading %d KB/s - %.2f MB, 共 %.2f MB'
				line = line % (downsize / 1024 / elapsed, downsize / 1024 / 1024, total_size / 1024 / 1024)
				print(line, end='\r')
def get_headers():
	"""Placeholder with no implementation yet; does nothing and returns ``None``."""
if __name__ == '__main__':
	# Ask for the BV identifier; stray whitespace would corrupt the URL
	video_id = input('please input Bv number:').strip()
	if not video_id:
		raise SystemExit('no Bv number given')
	# No trailing slash here: the path below already starts with one, and
	# joining the two used to produce "https://www.bilibili.com//video/..."
	base_url = 'https://www.bilibili.com'
	url = base_url + '/video/' + video_id + '?from=search'
	# get_url returns (video_url, audio_url); only the video stream is fetched
	video_url = get_url(url)[0]
	download(video_url, video_id)

以上是多執行緒爬取B站視訊的程式碼，後續會儘量讓功能更加完善、輸出更加美觀。

多執行緒參考:https://blog.csdn.net/s_kangkang_A/article/details/103051184

相關文章