C#獲取HTML原始碼

牛腩發表於2024-03-23
C#獲取HTML原始碼
2024年03月23日記錄
以前從網上找到的那個方法(程式碼如下),在一些網站上用不了,例如17K:取出來的要麼是亂碼,要麼就是一坨JS,好像是頁面用JS又重新載入了內容。
using System;
using System.Collections.Generic;
using System.Web;
using System.Net;
using System.IO;
using System.Text;
using System.Net.Security;    
using System.Security.Authentication;
using System.Security.Cryptography.X509Certificates;

namespace Niunan.XiaoShuo.Util
{
    /// <summary>
    /// Basic HTTP helper responsible for low-level HTTP communication: performs blocking
    /// GET and POST requests with <see cref="HttpWebRequest"/> and returns the response
    /// body decoded as UTF-8 text.
    /// </summary>
    public class HttpService
    {
        /// <summary>
        /// Server-certificate validation callback that accepts every certificate.
        /// </summary>
        /// <param name="sender">Unused.</param>
        /// <param name="certificate">Unused.</param>
        /// <param name="chain">Unused.</param>
        /// <param name="errors">Unused; policy errors are ignored.</param>
        /// <returns>Always <c>true</c>.</returns>
        public static bool CheckValidationResult(object sender, X509Certificate certificate, X509Chain chain, SslPolicyErrors errors)
        {
            // Accept unconditionally; per the original author, some HTTPS sites cannot be opened otherwise.
            // NOTE(review): Post/Get install this callback on the static ServicePointManager, so
            // certificate validation is effectively switched off for requests that consult it.
            // Confirm this is acceptable before using the class against sensitive endpoints.
            return true;
        }

        /// <summary>
        /// Sends an HTTP POST and returns the trimmed response body.
        /// </summary>
        /// <param name="xml">Request body; it is sent UTF-8 encoded.</param>
        /// <param name="url">Target URL.</param>
        /// <param name="isUseCert">Currently ignored: no client certificate is attached.</param>
        /// <param name="timeout">Request timeout in seconds (converted to milliseconds for <see cref="HttpWebRequest.Timeout"/>).</param>
        /// <param name="contenttype">Content-Type header, e.g. application/x-www-form-urlencoded or text/xml.</param>
        /// <param name="Authorization">Optional Authorization header value; omitted when null or empty (used for the Ronglian cloud-communication API).</param>
        /// <returns>The response body decoded as UTF-8 with leading/trailing whitespace removed.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
        public static string Post(string xml, string url, bool isUseCert, int timeout,string contenttype = "application/x-www-form-urlencoded",string Authorization="")
        {
            if (url == null)
            {
                throw new ArgumentNullException("url");
            }

            HttpWebRequest request = null;
            try
            {
                request = CreateRequest(url, "POST");
                request.Timeout = timeout * 1000;

                if (!string.IsNullOrEmpty(Authorization))
                {
                    request.Headers.Add(HttpRequestHeader.Authorization, Authorization);
                }

                // Content type and length of the POST payload.
                request.ContentType = contenttype;
                byte[] data = Encoding.UTF8.GetBytes(xml);
                request.ContentLength = data.Length;

                // Write the payload; the using block closes the stream even if Write throws.
                using (Stream reqStream = request.GetRequestStream())
                {
                    reqStream.Write(data, 0, data.Length);
                }

                return ReadResponseText(request);
            }
            finally
            {
                // Exceptions propagate unchanged (no catch), so the original stack trace is preserved.
                if (request != null)
                {
                    request.Abort();
                }
            }
        }

        /// <summary>
        /// Sends an HTTP GET and returns the trimmed response body.
        /// </summary>
        /// <param name="url">URL to request.</param>
        /// <returns>The response body decoded as UTF-8 with leading/trailing whitespace removed.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
        public static string Get(string url)
        {
            if (url == null)
            {
                throw new ArgumentNullException("url");
            }

            HttpWebRequest request = null;
            try
            {
                request = CreateRequest(url, "GET");
                return ReadResponseText(request);
            }
            finally
            {
                // Exceptions propagate unchanged (no catch), so the original stack trace is preserved.
                if (request != null)
                {
                    request.Abort();
                }
            }
        }

        /// <summary>
        /// Applies the shared ServicePointManager settings and builds a request for the given method.
        /// </summary>
        private static HttpWebRequest CreateRequest(string url, string method)
        {
            // Maximum number of concurrent connections.
            ServicePointManager.DefaultConnectionLimit = 200;

            // For https URLs, install the accept-all certificate callback.
            if (url.StartsWith("https", StringComparison.OrdinalIgnoreCase))
            {
                ServicePointManager.ServerCertificateValidationCallback =
                        new RemoteCertificateValidationCallback(CheckValidationResult);
            }

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Method = method;

            // Decode gzip/deflate bodies transparently; without this a compressed response
            // would be read as raw bytes and come out as garbage text.
            request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
            return request;
        }

        /// <summary>
        /// Executes the request and reads the whole response body as trimmed UTF-8 text.
        /// </summary>
        private static string ReadResponseText(HttpWebRequest request)
        {
            // Nested usings release the response, its stream and the reader on every path.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream body = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(body, Encoding.UTF8))
            {
                return reader.ReadToEnd().Trim();
            }
        }
    }
}

弄了一上午,到處問人、到處查,發現下面的程式碼可以用於17K網站:
// Fetch a page with HttpClient, letting the handler undo response compression.
// `url` is expected to be defined by the surrounding code.
var handler = new HttpClientHandler()
{
    // Transparently decompress gzip and deflate response bodies.
    AutomaticDecompression = System.Net.DecompressionMethods.GZip | System.Net.DecompressionMethods.Deflate,
    // Do not store or send cookies.
    UseCookies = false,
};
var httpClient = new HttpClient(handler);
var requestMessage = new HttpRequestMessage(HttpMethod.Get, url);
// Advertise only the encodings the handler can decode. Do not list "br" or "zstd" here:
// a server that picks one of them returns a body the handler leaves compressed, and
// ReadAsStringAsync then yields garbage.
requestMessage.Headers.Add("Accept-Encoding", "gzip, deflate");
var message = await httpClient.SendAsync(requestMessage);
var content = await message.Content.ReadAsStringAsync();
// Author's note: this snippet fetched the page the first few times and then stopped working,
// so the browser-automation approach below was used to grab the page source instead.

後來又來了個更狠的:用PuppeteerSharp,相當於用程式碼來控制系統中的Chrome瀏覽器開啟一個網頁,然後再獲取這個網頁的原始碼:
using PuppeteerSharp;  //nuget引入一下

namespace ConsoleApp2
{
    /// <summary>
    /// Console demo: opens a page in a headless browser through PuppeteerSharp and
    /// prints the HTML of the loaded page.
    /// </summary>
    internal class Program
    {
        /// <summary>
        /// Downloads a browser build, loads the target page, waits briefly and writes the page HTML to stdout.
        /// </summary>
        /// <param name="args">Unused command-line arguments.</param>
        static async Task Main(string[] args)
        {
            // Download the headless browser PuppeteerSharp provides. Without this line,
            // LaunchOptions.ExecutablePath must point at a locally installed browser, e.g.
            // "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe".
            await new BrowserFetcher().DownloadAsync(BrowserTag.Stable);

            // await using closes the browser even when navigation or content retrieval throws,
            // so no orphaned browser process is left behind.
            await using var browser = await Puppeteer.LaunchAsync(new LaunchOptions
            {
                Headless = true
            });

            await using var page = await browser.NewPageAsync();
            await page.GoToAsync("https://www.17k.com/book/554720.html");

            // Fixed 2-second pause before reading the content — presumably to let the page's
            // scripts finish populating it; TODO confirm whether a load-event wait would suffice.
            await page.WaitForTimeoutAsync(2000);
            string html = await page.GetContentAsync();

            Console.WriteLine(html);
        }
    }
}

另外還有一個Playwright,也能實現操作瀏覽器開啟網頁的功能,它主要用於自動化測試。以前記錄過這個名字,不過一直沒有時間看……主要是“懶”……
https://playwright.dev/dotnet/docs/intro

相關文章