C#獲取HTML原始碼
2024年03月23日記錄
以前從網上找到的那個方法,在一些網站上用不了,例如 17K:取出來的要麼是亂碼,要麼就是一坨 JS,好像是頁面用 JS 又重新載入了什麼的。
using System;
using System.Collections.Generic;
using System.Web;
using System.Net;
using System.IO;
using System.Text;
using System.Net.Security;
using System.Security.Authentication;
using System.Security.Cryptography.X509Certificates;

namespace Niunan.XiaoShuo.Util
{
    /// <summary>
    /// Basic HTTP helper responsible for the low-level HTTP communication
    /// (blocking POST and GET built on <see cref="HttpWebRequest"/>).
    /// </summary>
    public class HttpService
    {
        /// <summary>
        /// Server-certificate validation callback that accepts every certificate.
        /// </summary>
        /// <remarks>
        /// SECURITY: returning true unconditionally disables TLS certificate validation.
        /// The original author added it because some sites could not be opened otherwise.
        /// </remarks>
        /// <returns>Always <c>true</c>.</returns>
        public static bool CheckValidationResult(object sender, X509Certificate certificate, X509Chain chain, SslPolicyErrors errors)
        {
            // Accept unconditionally; otherwise some sites cannot be opened.
            return true;
        }

        /// <summary>
        /// Sends <paramref name="xml"/> as a UTF-8 encoded POST body and returns the response text.
        /// </summary>
        /// <param name="xml">Request body; encoded as UTF-8.</param>
        /// <param name="url">Target URL.</param>
        /// <param name="isUseCert">
        /// Currently ignored: no client certificate is attached (the certificate code copied
        /// from the WeChat demo was never enabled). Kept so existing callers still compile.
        /// </param>
        /// <param name="timeout">Request timeout in seconds.</param>
        /// <param name="contenttype">Content-Type header, e.g. application/x-www-form-urlencoded or text/xml.</param>
        /// <param name="Authorization">
        /// Value for the Authorization header; omitted when null or empty
        /// (the original author used it for the Ronglian cloud-communication API).
        /// </param>
        /// <returns>The response body decoded as UTF-8 and trimmed.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="url"/> or <paramref name="xml"/> is null.</exception>
        /// <exception cref="WebException">The request failed.</exception>
        public static string Post(string xml, string url, bool isUseCert, int timeout, string contenttype = "application/x-www-form-urlencoded", string Authorization = "")
        {
            if (url == null)
            {
                throw new ArgumentNullException(nameof(url));
            }

            // No GC.Collect() here: every stream and response is disposed deterministically
            // below, so there are no abandoned connections left for the GC to reclaim.
            ConfigureServicePoint(url);

            HttpWebRequest request = null;
            try
            {
                request = (HttpWebRequest)WebRequest.Create(url);
                request.Method = "POST";
                request.Timeout = timeout * 1000; // HttpWebRequest.Timeout is in milliseconds
                if (!string.IsNullOrEmpty(Authorization))
                {
                    request.Headers.Add(HttpRequestHeader.Authorization, Authorization);
                }

                // Content type and length of the POST payload.
                request.ContentType = contenttype;
                byte[] data = Encoding.UTF8.GetBytes(xml);
                request.ContentLength = data.Length;

                // Write the payload; the using block closes the stream even if Write throws.
                using (Stream reqStream = request.GetRequestStream())
                {
                    reqStream.Write(data, 0, data.Length);
                }

                return ReadResponseText(request);
            }
            finally
            {
                // Exceptions propagate unchanged (no catch / "throw e"), keeping the stack trace.
                if (request != null)
                {
                    request.Abort();
                }
            }
        }

        /// <summary>
        /// Performs an HTTP GET request and returns the response text.
        /// </summary>
        /// <param name="url">URL to request.</param>
        /// <returns>The response body decoded as UTF-8 and trimmed.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
        /// <exception cref="WebException">The request failed.</exception>
        public static string Get(string url)
        {
            if (url == null)
            {
                throw new ArgumentNullException(nameof(url));
            }

            ConfigureServicePoint(url);

            HttpWebRequest request = null;
            try
            {
                request = (HttpWebRequest)WebRequest.Create(url);
                request.Method = "GET";
                return ReadResponseText(request);
            }
            finally
            {
                if (request != null)
                {
                    request.Abort();
                }
            }
        }

        /// <summary>
        /// Applies the process-wide connection settings shared by <see cref="Post"/> and <see cref="Get"/>.
        /// </summary>
        private static void ConfigureServicePoint(string url)
        {
            // Maximum number of concurrent connections.
            ServicePointManager.DefaultConnectionLimit = 200;

            // For https URLs install the accept-all certificate callback.
            // NOTE: this is a process-wide setting, not a per-request one.
            if (url.StartsWith("https", StringComparison.OrdinalIgnoreCase))
            {
                ServicePointManager.ServerCertificateValidationCallback = new RemoteCertificateValidationCallback(CheckValidationResult);
            }
        }

        /// <summary>
        /// Executes the request and reads the whole response body as trimmed UTF-8 text.
        /// </summary>
        private static string ReadResponseText(HttpWebRequest request)
        {
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.UTF8))
            {
                return reader.ReadToEnd().Trim();
            }
        }
    }
}
弄了一上午,到處問人、到處查,發現下面的程式碼可以用於 17K 網站:
// Fetch a page with HttpClient and let the handler transparently decompress the body.
// Expects a string variable `url` to be in scope and must run inside an async method.
var handler = new HttpClientHandler()
{
    // The handler decodes only the methods enabled here. The original snippet also sent
    // "Accept-encoding: gzip, deflate, br, zstd" by hand, which invites the server to reply
    // with Brotli/Zstandard that this handler does not decode. That header is no longer
    // added manually, so only what can be decoded is advertised.
    AutomaticDecompression = System.Net.DecompressionMethods.GZip | System.Net.DecompressionMethods.Deflate,
    UseCookies = false,
};

// Disposing the client also disposes the handler. When fetching many pages,
// create one HttpClient and reuse it instead of building a new one per request.
using var httpClient = new HttpClient(handler);
using var requestMessage = new HttpRequestMessage(HttpMethod.Get, url);
using var message = await httpClient.SendAsync(requestMessage);
var content = await message.Content.ReadAsStringAsync();

// Original author's note: this code fetched the page the first few times and then stopped
// working, so the browser-automation approach below was used to grab the page source instead.
// NOTE(review): the undecodable "br, zstd" encodings advertised by the original header are
// one plausible cause of the intermittent failures; confirm against the site.
後來又來了個更狠的:用 PuppeteerSharp,相當於用程式碼來控制系統中的 Chrome 瀏覽器開啟一個網頁,然後再獲取這個網頁的原始碼。
using PuppeteerSharp; // add the PuppeteerSharp package via NuGet

namespace ConsoleApp2
{
    /// <summary>
    /// Console program that opens a page in a headless browser driven by PuppeteerSharp
    /// and prints the page's HTML to standard output.
    /// </summary>
    internal class Program
    {
        /// <summary>Page fetched when no URL is passed on the command line.</summary>
        private const string DefaultUrl = "https://www.17k.com/book/554720.html";

        /// <summary>
        /// Entry point. Uses the first command-line argument as the URL when present,
        /// otherwise <see cref="DefaultUrl"/>.
        /// </summary>
        /// <param name="args">Optional: <c>args[0]</c> is the URL to fetch.</param>
        static async Task Main(string[] args)
        {
            string url = args.Length > 0 ? args[0] : DefaultUrl;

            // Automatically download the browser build PuppeteerSharp provides; without this
            // line a local browser must be specified through ExecutablePath below.
            await new BrowserFetcher().DownloadAsync(BrowserTag.Stable);

            var browser = await Puppeteer.LaunchAsync(new LaunchOptions
            {
                //ExecutablePath = "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
                Headless = true
            });

            try
            {
                var page = await browser.NewPageAsync();
                await page.GoToAsync(url);

                // Same fixed 2-second wait as before, giving page scripts time to run;
                // Task.Delay replaces the page-level timeout helper.
                await Task.Delay(2000);

                string html = await page.GetContentAsync();
                Console.WriteLine(html);
            }
            finally
            {
                // Close the browser even when navigation or reading the content fails,
                // so no headless browser process is left running.
                await browser.CloseAsync();
            }
        }
    }
}
然後還有一個 Playwright,也能實現操作瀏覽器開啟網頁的功能,是用於自動化測試的。以前有記錄過這個名字,不過一直沒有時間看……主要是“懶”……
https://playwright.dev/dotnet/docs/intro