B站測試視訊地址:
https://m.bilibili.com/video/BV1Dk4y1q781
程式碼我已經放上 GitHub,使用的是 ThinkPHP 6(tp6)框架,
README.md
中也有詳細的流程說明。下載程式碼後可執行http://伺服器/index/index?bvid=BV1Dk4y1q781&start=0&end=29
即可檢視效果。程式碼中 cURL 用到的
$header 請求頭
來自 Chrome,如圖:
/**
 * Download the video page for a BV id and extract its metadata (aid plus cid / page / part).
 *
 * The page is requested with cURL using request headers copied from Chrome, and the
 * returned HTML is passed to $this->logic->getPagesJsonData() for parsing.
 *
 * @param string $bvid BV id of the video
 * @return mixed the 'data' entry of the result returned by getPagesJsonData()
 * @throws \Exception when the HTTP request fails or the parser reports code 500
 */
public function getPageData($bvid)
{
    // Percent-encode the id so that a malformed value cannot alter the request path.
    $url = "https://www.bilibili.com/video/" . rawurlencode((string) $bvid);
    // HTTP request headers
    $header = [
        "authority: www.bilibili.com",
        "cache-control: max-age=0",
        'sec-ch-ua: "Chromium";v="86", "\"Not\\A;Brand";v="99", "Google Chrome";v="86"',
        'sec-ch-ua-mobile: ?0',
        'upgrade-insecure-requests: 1',
        'user-agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36',
        'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site: same-origin',
        'sec-fetch-mode: navigate',
        'sec-fetch-user: ?1',
        'sec-fetch-dest: document',
        'accept-language: zh-CN,zh;q=0.9,en;q=0.8',
        'cookie: finger=1295565314; bsource=search_google; _uuid=B789D864-3818-3B60-C60C-40572299324578222infoc; buvid3=43FE4003-58BE-4DFF-9EC9-D2673FBE9672138377infoc; CURRENT_FNVAL=80; blackside_state=1; sid=j0093qgz; finger=1295565314; PVID=3; rpdid=|(umR~Yuk~)k0J\'uY|RRYu)~Y'
    ];
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); // return the HTML instead of printing it
    curl_setopt($ch, CURLOPT_HTTPHEADER, $header); // send the request headers above
    curl_setopt($ch, CURLOPT_ENCODING, ''); // accept and transparently decode compressed responses
    // NOTE(review): certificate verification is switched off here, which exposes the
    // request (including the cookie header) to man-in-the-middle attacks. Kept as-is
    // to preserve behaviour; prefer configuring a CA bundle and removing this line.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, 0);
    curl_setopt($ch, CURLOPT_SSL_VERIFYSTATUS, 0);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); // follow redirects
    $content = curl_exec($ch);
    if ($content === false) {
        // Surface the transport error instead of passing `false` on to the parser.
        $error = curl_error($ch);
        curl_close($ch);
        throw new \Exception('請求視訊頁面失敗:' . $error);
    }
    curl_close($ch);
    // Debug aid, labelled "將首頁html儲存成檔案" in the accompanying text:
    // uncomment the three lines below to dump the fetched HTML to a file.
    // $fp = fopen(app()->getRootPath().'view/pages.html','w');
    // fwrite($fp,$content);
    // fclose($fp);
    // Extract the video data from the HTML
    $pagesData = $this->logic->getPagesJsonData($content);
    if ((int) $pagesData['code'] === 500) {
        throw new \Exception($pagesData['msg']);
    }
    return $pagesData['data'];
}
把“首頁html儲存成檔案”這段被註釋掉的程式碼取消註釋(刪掉行首的 //),執行程式後就可以在
根目錄/view/pages.html
檢視 cURL 返回的內容。在 pages.html 搜尋
<script>window.__INITIAL_STATE__
可以看到我們要用正規表示式匹配的資料在這個<script>標籤內。呼叫
getPagesJsonData()
用正規表示式獲取視訊的aid、pages(含有part、cid、page)
/**
 * Extract the aid and the per-part data (cid, page, part) from the video page HTML.
 *
 * The aid is taken from the first `={"aid":<digits>` occurrence; the parts are taken
 * from the JSON array that follows `pages":` inside `videoData` and precedes `,"subtitle":`.
 *
 * @param string $content HTML of the video page
 * @return array ['code' => 200, 'data' => array] on success, where 'data' holds the key
 *               'aid' plus one integer-keyed entry per part (keys 'cid', 'page', 'part');
 *               ['code' => 500, 'msg' => string] on failure
 */
public function getPagesJsonData(string $content)
{
    if (empty($content)) {
        return ['code' => 500, 'msg' => '沒有需要解析的內容'];
    }

    // Collected video data
    $pagesData = [];

    // Match the aid
    preg_match('/={"aid":(\d+).*/', $content, $matchAid);
    if (empty($matchAid)) {
        return ['code' => 500, 'msg' => '沒有匹配到 aid'];
    }
    $pagesData['aid'] = $matchAid[1];

    // Match the JSON array stored under "pages"
    preg_match('/videoData.*pages\":(.*),\"subtitle\":/', $content, $matchPages);
    if (empty($matchPages)) {
        return ['code' => 500, 'msg' => '沒有匹配到 pages'];
    }

    // The capture is cut out by a regex, so it may not be valid JSON; refuse to
    // iterate over anything that did not decode to an array.
    $jsonToArray = json_decode($matchPages[1], true);
    if (!is_array($jsonToArray)) {
        return ['code' => 500, 'msg' => 'pages 資料解析失敗'];
    }

    // Keep only the fields the callers need
    foreach ($jsonToArray as $k => $v) {
        $pagesData[$k]['cid'] = $v['cid'] ?? null;
        $pagesData[$k]['page'] = $v['page'] ?? null;
        $pagesData[$k]['part'] = $v['part'] ?? null;
    }

    return ['code' => 200, 'data' => $pagesData];
}
列印 $pagesData 陣列。其結果如下:
第二步、獲取 subtitle_url
利用上一步獲取到的 cid、aid、bvid
引數值去請求介面,利用正則匹配到 subtitle_url 的值,這個值是字幕的 json 檔案連結。
/**
 * Request the player API for one video part and extract the links of its CC subtitle JSON files.
 *
 * The response body is passed to $this->logic->getSubtitleData() for parsing.
 *
 * @param int $cid cid of the video part (sent as `id=cid:<cid>`)
 * @param int $aid AV number of the video
 * @param string $bvid BV id of the video
 * @return mixed the 'data' entry of the result returned by the logic layer's getSubtitleData()
 * @throws \Exception when the HTTP request fails or the parser reports code 500
 */
public function getSubtitleUrl(int $cid, int $aid, string $bvid)
{
    // $bvid is a free-form string, so encode it before placing it in the query string.
    $url = 'https://api.bilibili.com/x/player.so?id=' . urlencode('cid:') . $cid . '&aid=' . $aid . "&bvid=" . urlencode($bvid);
    $header = [
        "authority: www.bilibili.com",
        'sec-ch-ua: "Chromium";v="86", "\"Not\\A;Brand";v="99", "Google Chrome";v="86"',
        'user-agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36',
        'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'origin: https://www.bilibili.com',
        'sec-fetch-site: same-origin',
        'sec-fetch-mode: cors',
        'sec-fetch-dest: empty',
        'referer: https://www.bilibili.com/',
        'accept-language: zh-CN,zh;q=0.9,en;q=0.8',
        'cookie: finger=1295565314; bsource=search_google; _uuid=B789D864-3818-3B60-C60C-40572299324578222infoc; buvid3=43FE4003-58BE-4DFF-9EC9-D2673FBE9672138377infoc; CURRENT_FNVAL=80; blackside_state=1; sid=j0093qgz; finger=1295565314; PVID=3; rpdid=|(umR~Yuk~)k0J\'uY|RRYu)~Y'
    ];
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); // return the body instead of printing it
    curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
    // NOTE(review): certificate verification is switched off here, which exposes the
    // request (including the cookie header) to man-in-the-middle attacks. Kept as-is
    // to preserve behaviour; prefer configuring a CA bundle and removing this line.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, 0);
    curl_setopt($ch, CURLOPT_SSL_VERIFYSTATUS, 0);
    $content = curl_exec($ch);
    if ($content === false) {
        // Report the transport error rather than handing `false` to the parser.
        $error = curl_error($ch);
        curl_close($ch);
        throw new \Exception('請求字幕介面失敗:' . $error);
    }
    curl_close($ch);
    // Debug aid, labelled "將 subtitle html儲存成檔案" in the accompanying text:
    // uncomment the three lines below to dump the response to a file.
    // $fp = fopen(app()->getRootPath().'view/subtitle.html','w');
    // fwrite($fp,$content);
    // fclose($fp);
    $subtitleData = $this->logic->getSubtitleData($content);
    if ((int) $subtitleData['code'] === 500) {
        // Some error results may not carry a 'msg' entry; fall back to an empty message.
        throw new \Exception($subtitleData['msg'] ?? '');
    }
    return $subtitleData['data'];
}
把“將 subtitle html儲存成檔案”這段被註釋掉的程式碼取消註釋(刪掉行首的 //),執行程式後就可以在
根目錄/view/subtitle.html
檢視 cURL 返回的內容。在 subtitle.html 中可以看到我們要用正規表示式匹配的資料在這個<subtitle></subtitle>標籤內,
subtitle_url
的值即為字幕 json 檔案的地址。
{
    "allow_submit": false,
    "lan": "",
    "lan_doc": "",
    "subtitles": [
        {
            "id": 31982954292445190,
            "lan": "en-US",
            "lan_doc": "英語(美國)",
            "is_lock": false,
            "author_mid": 483301783,
            "subtitle_url": "//i0.hdslb.com/bfs/subtitle/1cc78982172c6892257eb955d3feef80f2d1560c.json"
        },
        {
            "id": 31982964274364420,
            "lan": "zh-CN",
            "lan_doc": "中文(中國)",
            "is_lock": false,
            "author_mid": 483301783,
            "subtitle_url": "//i0.hdslb.com/bfs/subtitle/fc03562711af08687398775a4423dc71028e1203.json"
        }
    ]
}
呼叫
getSubtitleData()
用正規表示式獲取subtitle_url、lan
/**
 * Extract the CC subtitle JSON links, keyed by language code, from the player API response.
 *
 * The subtitle list is the JSON array that follows `subtitles":` and ends right
 * before `}</subtitle>` in the response.
 *
 * @param string $content body returned by the player API
 * @return array ['code' => 200, 'data' => array<string, string>] on success, mapping each
 *               'lan' value to its 'subtitle_url'; ['code' => 500, 'msg' => string] on failure
 */
public function getSubtitleData(string $content)
{
    if (empty($content)) {
        // The message belongs under 'msg': that is the key read for code 500 results.
        return ['code' => 500, 'msg' => '沒有需要解析的內容'];
    }

    // Match the subtitle list
    preg_match('/subtitles":(.*?)}<\/subtitle>/', $content, $matchSubtitle);
    if (empty($matchSubtitle)) {
        return ['code' => 500, 'msg' => '沒有匹配到 subtitle'];
    }

    // The capture is cut out by a regex, so it may not be valid JSON; refuse to
    // iterate over anything that did not decode to an array.
    $jsonToArray = json_decode($matchSubtitle[1], true);
    if (!is_array($jsonToArray)) {
        return ['code' => 500, 'msg' => 'subtitle 資料解析失敗'];
    }

    // Language code => link of the subtitle JSON file
    $subtitleData = [];
    foreach ($jsonToArray as $v) {
        // Skip entries that lack either field instead of raising a warning.
        if (!isset($v['lan'], $v['subtitle_url'])) {
            continue;
        }
        $subtitleData[$v['lan']] = $v['subtitle_url'];
    }

    return ['code' => 200, 'data' => $subtitleData];
}
列印 $subtitleData 陣列。其結果如下:
/**
 * Download a subtitle JSON file and append its subtitle text to the output file.
 *
 * The downloaded body is passed to $this->logic->writeSubtitleToFile() together
 * with $this->start and $this->end.
 *
 * @param string $jsonUrl scheme-less link of the subtitle JSON file ("https:" is prepended)
 * @param string $part title of the subtitle
 * @param int $preTitle number placed in front of the subtitle title
 * @return mixed the 'data' entry of the result returned by writeSubtitleToFile()
 * @throws \Exception when the HTTP request fails or writing reports code 500
 */
public function getSubtitleData($jsonUrl, $part, $preTitle)
{
    $url = 'https:' . $jsonUrl;
    $header = [
        'sec-ch-ua: "Chromium";v="86", "\"Not\\A;Brand";v="99", "Google Chrome";v="86"',
        'Accept: application/json, text/javascript, */*; q=0.01',
        'Referer: https://www.bilibili.com/',
        'sec-ch-ua-mobile: ?0',
        'User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36',
    ];
    $ch = curl_init($url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); // return the body instead of printing it
    curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
    // NOTE(review): certificate verification is switched off here, which exposes the
    // request to man-in-the-middle attacks. Kept as-is to preserve behaviour; prefer
    // configuring a CA bundle and removing this line.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, 0);
    curl_setopt($ch, CURLOPT_SSL_VERIFYSTATUS, 0);
    $content = curl_exec($ch);
    if ($content === false) {
        // Report the transport error rather than handing `false` to the writer.
        $error = curl_error($ch);
        curl_close($ch);
        throw new \Exception('請求字幕檔案失敗:' . $error);
    }
    curl_close($ch);
    // Write the subtitles to the output file
    $fileData = $this->logic->writeSubtitleToFile($content, $part, $this->start, $this->end, $preTitle);
    if ((int) $fileData['code'] === 500) {
        throw new \Exception($fileData['msg']);
    }
    return $fileData['data'];
}
請求字幕json檔案地址後,返回如下內容:
{
    "font_size": 0.4,
    "font_color": "#FFFFFF",
    "background_alpha": 0.5,
    "background_color": "#9C27B0",
    "Stroke": "none",
    "body": [
        {
            "from": 0,
            "to": 3.41,
            "location": 2,
            "content": "Today we have something a little different." // 這是字幕
        },
        {
            "from": 3.41,
            "to": 7.41,
            "location": 2,
            "content": "Dr. Jane Goodall is going to tell you a story." // 這是字幕
        },
        // 省略程式碼
    ]
}
從上面返回的 json 陣列中,拿到字幕
content
/**
 * Concatenate the subtitle texts of one subtitle JSON document and append them to the output file.
 *
 * The texts are read from the 'content' field of every entry in the document's
 * 'body' array and appended to view/files/ClosedCaption_P<start+1>-P<end+1>.txt
 * below the application root, preceded by a "P<preTitle>. <part>" heading.
 *
 * @param string $content subtitle JSON document
 * @param string $part title of the subtitle
 * @param int $start zero-based p value at which crawling starts (used in the file name)
 * @param int $end zero-based p value at which crawling ends (used in the file name)
 * @param int $preTitle number placed in front of the subtitle title
 * @return array ['code' => 200, 'data' => int bytes written] on success,
 *               ['code' => 500, 'msg' => string] on failure
 */
public function writeSubtitleToFile(string $content, string $part, int $start, int $end, int $preTitle)
{
    // The file name uses one-based p values.
    $start += 1;
    $end += 1;

    // The subtitles live in the 'body' array; guard against a document that did not
    // decode or has no usable 'body' before indexing into it.
    $jsonToArray = json_decode($content, true);
    $bodyData = is_array($jsonToArray) ? ($jsonToArray['body'] ?? null) : null;
    if (empty($bodyData) || !is_array($bodyData)) {
        return ['code' => 500, 'msg' => '沒有獲取到字幕'];
    }

    // Heading followed by all subtitle texts, joined without a separator
    $ccString = "\n\n\n\nP$preTitle. " . $part . "\n";
    foreach ($bodyData as $v) {
        $ccString .= $v['content'] ?? '';
    }

    // NOTE(review): as visible here this replaces a space with a space, i.e. a no-op;
    // the search string was presumably a double or non-breaking space — confirm.
    $ccString = str_replace(' ', ' ', $ccString);

    $filePath = app()->getRootPath() . "view/files/ClosedCaption_P$start-P$end.txt";

    // Make sure the target directory exists so that the append cannot fail for that reason.
    $directory = dirname($filePath);
    if (!is_dir($directory) && !mkdir($directory, 0777, true) && !is_dir($directory)) {
        return ['code' => 500, 'msg' => '將字幕寫入檔案失敗'];
    }

    // Append to the file; yields the number of bytes written, or false on failure.
    $fileData = file_put_contents($filePath, $ccString, FILE_APPEND);
    if ($fileData === false) {
        return ['code' => 500, 'msg' => '將字幕寫入檔案失敗'];
    }

    return ['code' => 200, 'data' => $fileData];
}
生成的檔案在
根目錄\view\files\
下,如圖所示:
由於時間有限,程式碼並沒有經過充分的測試,如發現問題歡迎在評論區提出。
本作品採用《CC 協議》,轉載必須註明作者和本文連結