The previous post (https://www.cnblogs.com/meowv/p/12971041.html) used HtmlAgilityPack to scrape wallpaper data and store the images in the database. This post continues the series by scraping trending news from major platforms across the web.
As before, you can preview the finished feature on my blog: https://meowv.com/hot. The approach is much the same as the wallpaper scraper.
This time there are 18 sources to scrape: cnblogs, V2EX, SegmentFault, Juejin, WeChat hot articles, Douban picks, ITHome, 36Kr, Baidu Tieba, Baidu hot search, Weibo hot search, Zhihu hot list, Zhihu Daily, NetEase News, GitHub, Douyin hot topics, Douyin videos, and Douyin positive energy.
The data again goes into the database, so start as usual by creating the entity class and the custom repository. The entity is named HotNews. Here is the code:
// HotNews.cs
using System;
using Volo.Abp.Domain.Entities;

namespace Meowv.Blog.Domain.HotNews
{
    public class HotNews : Entity<Guid>
    {
        /// <summary>
        /// Title
        /// </summary>
        public string Title { get; set; }

        /// <summary>
        /// Link
        /// </summary>
        public string Url { get; set; }

        /// <summary>
        /// SourceId
        /// </summary>
        public int SourceId { get; set; }

        /// <summary>
        /// Creation time
        /// </summary>
        public DateTime CreateTime { get; set; }
    }
}
You can complete the rest yourselves; in the end an empty table, meowv_hotnews, is generated in the database.
Next, put the platforms into an enum class, HotNewsEnum.cs.
// HotNewsEnum.cs
using System.ComponentModel;

namespace Meowv.Blog.Domain.Shared.Enum
{
    public enum HotNewsEnum
    {
        [Description("部落格園")]
        cnblogs = 1,

        [Description("V2EX")]
        v2ex = 2,

        [Description("SegmentFault")]
        segmentfault = 3,

        [Description("掘金")]
        juejin = 4,

        [Description("微信熱門")]
        weixin = 5,

        [Description("豆瓣精選")]
        douban = 6,

        [Description("IT之家")]
        ithome = 7,

        [Description("36氪")]
        kr36 = 8,

        [Description("百度貼吧")]
        tieba = 9,

        [Description("百度熱搜")]
        baidu = 10,

        [Description("微博熱搜")]
        weibo = 11,

        [Description("知乎熱榜")]
        zhihu = 12,

        [Description("知乎日報")]
        zhihudaily = 13,

        [Description("網易新聞")]
        news163 = 14,

        [Description("GitHub")]
        github = 15,

        [Description("抖音熱點")]
        douyin_hot = 16,

        [Description("抖音視訊")]
        douyin_video = 17,

        [Description("抖音正能量")]
        douyin_positive = 18
    }
}
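The Description attribute on each member holds the display name shown to users; it can be read back with a little reflection. Below is a hedged sketch: the extension method name GetDescription and the stand-in enum HotNewsEnumDemo are my own illustrations, not part of the original project.

```csharp
using System;
using System.ComponentModel;
using System.Reflection;

public static class EnumExtensions
{
    // Reads the [Description] text of an enum member, falling back to the member name.
    public static string GetDescription(this Enum value)
    {
        var field = value.GetType().GetField(value.ToString());
        var attr = field?.GetCustomAttribute<DescriptionAttribute>();
        return attr?.Description ?? value.ToString();
    }
}

// A tiny stand-in enum so the sketch compiles on its own.
public enum HotNewsEnumDemo
{
    [Description("部落格園")]
    Cnblogs = 1
}
```

With the real HotNewsEnum, a call like HotNewsEnum.cnblogs.GetDescription() would return "部落格園".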
As with the wallpaper scraper in the previous post, some preparation comes first.
Add HotNewsJobItem<T> in the .Application.Contracts layer, and add HotNewsJob in the .BackgroundJobs layer to handle the crawler logic, injecting the repository IHotNewsRepository via the constructor.
// HotNewsJobItem.cs
using Meowv.Blog.Domain.Shared.Enum;

namespace Meowv.Blog.Application.Contracts.HotNews
{
    public class HotNewsJobItem<T>
    {
        /// <summary>
        /// <see cref="Result"/>
        /// </summary>
        public T Result { get; set; }

        /// <summary>
        /// Source
        /// </summary>
        public HotNewsEnum Source { get; set; }
    }
}
// HotNewsJob.cs
using Meowv.Blog.Domain.HotNews.Repositories;
using System;
using System.Net.Http;
using System.Threading.Tasks;

namespace Meowv.Blog.BackgroundJobs.Jobs.HotNews
{
    public class HotNewsJob : IBackgroundJob
    {
        private readonly IHttpClientFactory _httpClient;
        private readonly IHotNewsRepository _hotNewsRepository;

        public HotNewsJob(IHttpClientFactory httpClient,
                          IHotNewsRepository hotNewsRepository)
        {
            _httpClient = httpClient;
            _hotNewsRepository = hotNewsRepository;
        }

        public async Task ExecuteAsync()
        {
            throw new NotImplementedException();
        }
    }
}
Next, pin down the data-source URLs. Some of the sources above return HTML while others return JSON directly, so for convenient requests I also injected IHttpClientFactory.
The assembled list of sources to scrape looks like this:
...
var hotnewsUrls = new List<HotNewsJobItem<string>>
{
    new HotNewsJobItem<string> { Result = "https://www.cnblogs.com", Source = HotNewsEnum.cnblogs },
    new HotNewsJobItem<string> { Result = "https://www.v2ex.com/?tab=hot", Source = HotNewsEnum.v2ex },
    new HotNewsJobItem<string> { Result = "https://segmentfault.com/hottest", Source = HotNewsEnum.segmentfault },
    new HotNewsJobItem<string> { Result = "https://web-api.juejin.im/query", Source = HotNewsEnum.juejin },
    new HotNewsJobItem<string> { Result = "https://weixin.sogou.com", Source = HotNewsEnum.weixin },
    new HotNewsJobItem<string> { Result = "https://www.douban.com/group/explore", Source = HotNewsEnum.douban },
    new HotNewsJobItem<string> { Result = "https://www.ithome.com", Source = HotNewsEnum.ithome },
    new HotNewsJobItem<string> { Result = "https://36kr.com/newsflashes", Source = HotNewsEnum.kr36 },
    new HotNewsJobItem<string> { Result = "http://tieba.baidu.com/hottopic/browse/topicList", Source = HotNewsEnum.tieba },
    new HotNewsJobItem<string> { Result = "http://top.baidu.com/buzz?b=341", Source = HotNewsEnum.baidu },
    new HotNewsJobItem<string> { Result = "https://s.weibo.com/top/summary/summary", Source = HotNewsEnum.weibo },
    new HotNewsJobItem<string> { Result = "https://www.zhihu.com/api/v3/feed/topstory/hot-lists/total?limit=50&desktop=true", Source = HotNewsEnum.zhihu },
    new HotNewsJobItem<string> { Result = "https://daily.zhihu.com", Source = HotNewsEnum.zhihudaily },
    new HotNewsJobItem<string> { Result = "http://news.163.com/special/0001386F/rank_whole.html", Source = HotNewsEnum.news163 },
    new HotNewsJobItem<string> { Result = "https://github.com/trending", Source = HotNewsEnum.github },
    new HotNewsJobItem<string> { Result = "https://www.iesdouyin.com/web/api/v2/hotsearch/billboard/word", Source = HotNewsEnum.douyin_hot },
    new HotNewsJobItem<string> { Result = "https://www.iesdouyin.com/web/api/v2/hotsearch/billboard/aweme", Source = HotNewsEnum.douyin_video },
    new HotNewsJobItem<string> { Result = "https://www.iesdouyin.com/web/api/v2/hotsearch/billboard/aweme/?type=positive", Source = HotNewsEnum.douyin_positive },
};
...
A few of these are special: Juejin, Baidu hot search, and NetEase News.
Juejin requires a POST request that returns JSON data and needs its own specific request headers and request body, which is why an HttpClient is created through IHttpClientFactory.
Baidu hot search and NetEase News, two old-timers, play tricks: their pages are encoded in GB2312, so the encoding has to be specified explicitly for them, otherwise the scraped text comes back garbled.
...
var web = new HtmlWeb();
var list_task = new List<Task<HotNewsJobItem<object>>>();

hotnewsUrls.ForEach(item =>
{
    var task = Task.Run(async () =>
    {
        var obj = new object();
        if (item.Source == HotNewsEnum.juejin)
        {
            using var client = _httpClient.CreateClient();
            client.DefaultRequestHeaders.Add("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.14 Safari/537.36 Edg/83.0.478.13");
            client.DefaultRequestHeaders.Add("X-Agent", "Juejin/Web");
            var data = "{\"extensions\":{\"query\":{ \"id\":\"21207e9ddb1de777adeaca7a2fb38030\"}},\"operationName\":\"\",\"query\":\"\",\"variables\":{ \"first\":20,\"after\":\"\",\"order\":\"THREE_DAYS_HOTTEST\"}}";
            var buffer = data.SerializeUtf8();
            var byteContent = new ByteArrayContent(buffer);
            byteContent.Headers.ContentType = new MediaTypeHeaderValue("application/json");
            var httpResponse = await client.PostAsync(item.Result, byteContent);
            obj = await httpResponse.Content.ReadAsStringAsync();
        }
        else
        {
            Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
            obj = await web.LoadFromWebAsync(item.Result, (item.Source == HotNewsEnum.baidu || item.Source == HotNewsEnum.news163) ? Encoding.GetEncoding("GB2312") : Encoding.UTF8);
        }
        return new HotNewsJobItem<object>
        {
            Result = obj,
            Source = item.Source
        };
    });
    list_task.Add(task);
});
Task.WaitAll(list_task.ToArray());
Looping over hotnewsUrls, you can see that HotNewsJobItem<object> carries its Result as object: some sources yield a JSON string while others yield an HtmlDocument, so object is used to receive both uniformly.
Juejin gets special handling: an HttpClient sends the POST request, and the response comes back as a JSON string.
For Baidu hot search and NetEase News, Encoding.RegisterProvider(CodePagesEncodingProvider.Instance) registers the code-page encoding provider, and the page encoding is then specified in web.LoadFromWebAsync(...) when loading the page; a ternary expression selects GB2312 for those two sources and UTF-8 for everything else.
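Since Result is typed as object, downstream code has to inspect the runtime type before parsing. A minimal sketch of that dispatch follows; PayloadDispatch and HtmlDocumentPlaceholder are illustrative names of my own (the real code casts to HtmlAgilityPack's HtmlDocument), not from the original project.

```csharp
using System;

// The job stores every payload as object; consumers pattern-match on the
// runtime type before parsing. HtmlDocument (from HtmlAgilityPack) is
// represented by a placeholder type here so the sketch compiles on its own.
public sealed class HtmlDocumentPlaceholder { }

public static class PayloadDispatch
{
    public static string Describe(object result) => result switch
    {
        string _ => "json",                  // JSON sources: Juejin, Tieba, Zhihu, Douyin
        HtmlDocumentPlaceholder _ => "html", // HTML sources loaded via HtmlWeb
        _ => "unknown"
    };
}
```

The same pattern-match shape (`is string` vs. `is HtmlDocument`) is what the per-source casts in the parsing code below rely on.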
With that step done, you can loop over list_task and extract the data, using XPath syntax for the HTML sources and JSON parsing for the rest.
...
var hotNews = new List<HotNews>();

foreach (var list in list_task)
{
    var item = await list;
    var sourceId = (int)item.Source;
    ...
}

if (hotNews.Any())
{
    await _hotNewsRepository.DeleteAsync(x => true);
    await _hotNewsRepository.BulkInsertAsync(hotNews);
}
This crawler is just as simple: all it needs is each item's title and link, so the main task is locating the right list of a tags on each page. There's no need to analyze them one by one; here's the code.
// cnblogs
if (item.Source == HotNewsEnum.cnblogs)
{
    var nodes = ((HtmlDocument)item.Result).DocumentNode.SelectNodes("//div[@class='post_item_body']/h3/a").ToList();
    nodes.ForEach(x =>
    {
        hotNews.Add(new HotNews
        {
            Title = x.InnerText,
            Url = x.GetAttributeValue("href", ""),
            SourceId = sourceId,
            CreateTime = DateTime.Now
        });
    });
}
// V2EX
if (item.Source == HotNewsEnum.v2ex)
{
    var nodes = ((HtmlDocument)item.Result).DocumentNode.SelectNodes("//span[@class='item_title']/a").ToList();
    nodes.ForEach(x =>
    {
        hotNews.Add(new HotNews
        {
            Title = x.InnerText,
            Url = $"https://www.v2ex.com{x.GetAttributeValue("href", "")}",
            SourceId = sourceId,
            CreateTime = DateTime.Now
        });
    });
}
// SegmentFault
if (item.Source == HotNewsEnum.segmentfault)
{
    var nodes = ((HtmlDocument)item.Result).DocumentNode.SelectNodes("//div[@class='news__item-info clearfix']/a").Where(x => x.InnerText.IsNotNullOrEmpty()).ToList();
    nodes.ForEach(x =>
    {
        hotNews.Add(new HotNews
        {
            Title = x.SelectSingleNode(".//h4").InnerText,
            Url = $"https://segmentfault.com{x.GetAttributeValue("href", "")}",
            SourceId = sourceId,
            CreateTime = DateTime.Now
        });
    });
}
// Juejin
if (item.Source == HotNewsEnum.juejin)
{
    var obj = JObject.Parse((string)item.Result);
    var nodes = obj["data"]["articleFeed"]["items"]["edges"];
    foreach (var node in nodes)
    {
        hotNews.Add(new HotNews
        {
            Title = node["node"]["title"].ToString(),
            Url = node["node"]["originalUrl"].ToString(),
            SourceId = sourceId,
            CreateTime = DateTime.Now
        });
    }
}
// WeChat hot articles
if (item.Source == HotNewsEnum.weixin)
{
    var nodes = ((HtmlDocument)item.Result).DocumentNode.SelectNodes("//ul[@class='news-list']/li/div[@class='txt-box']/h3/a").ToList();
    nodes.ForEach(x =>
    {
        hotNews.Add(new HotNews
        {
            Title = x.InnerText,
            Url = x.GetAttributeValue("href", ""),
            SourceId = sourceId,
            CreateTime = DateTime.Now
        });
    });
}
// Douban picks
if (item.Source == HotNewsEnum.douban)
{
    var nodes = ((HtmlDocument)item.Result).DocumentNode.SelectNodes("//div[@class='channel-item']/div[@class='bd']/h3/a").ToList();
    nodes.ForEach(x =>
    {
        hotNews.Add(new HotNews
        {
            Title = x.InnerText,
            Url = x.GetAttributeValue("href", ""),
            SourceId = sourceId,
            CreateTime = DateTime.Now
        });
    });
}
// ITHome
if (item.Source == HotNewsEnum.ithome)
{
    var nodes = ((HtmlDocument)item.Result).DocumentNode.SelectNodes("//div[@class='lst lst-2 hot-list']/div[1]/ul/li/a").ToList();
    nodes.ForEach(x =>
    {
        hotNews.Add(new HotNews
        {
            Title = x.InnerText,
            Url = x.GetAttributeValue("href", ""),
            SourceId = sourceId,
            CreateTime = DateTime.Now
        });
    });
}
// 36Kr
if (item.Source == HotNewsEnum.kr36)
{
    var nodes = ((HtmlDocument)item.Result).DocumentNode.SelectNodes("//div[@class='hotlist-main']/div[@class='hotlist-item-toptwo']/a[2]|//div[@class='hotlist-main']/div[@class='hotlist-item-other clearfloat']/div[@class='hotlist-item-other-info']/a").ToList();
    nodes.ForEach(x =>
    {
        hotNews.Add(new HotNews
        {
            Title = x.InnerText,
            Url = $"https://36kr.com{x.GetAttributeValue("href", "")}",
            SourceId = sourceId,
            CreateTime = DateTime.Now
        });
    });
}
// Baidu Tieba
if (item.Source == HotNewsEnum.tieba)
{
    var obj = JObject.Parse(((HtmlDocument)item.Result).ParsedText);
    var nodes = obj["data"]["bang_topic"]["topic_list"];
    foreach (var node in nodes)
    {
        hotNews.Add(new HotNews
        {
            Title = node["topic_name"].ToString(),
            Url = node["topic_url"].ToString().Replace("amp;", ""),
            SourceId = sourceId,
            CreateTime = DateTime.Now
        });
    }
}
// Baidu hot search
if (item.Source == HotNewsEnum.baidu)
{
    var nodes = ((HtmlDocument)item.Result).DocumentNode.SelectNodes("//table[@class='list-table']//tr/td[@class='keyword']/a[@class='list-title']").ToList();
    nodes.ForEach(x =>
    {
        hotNews.Add(new HotNews
        {
            Title = x.InnerText,
            Url = x.GetAttributeValue("href", ""),
            SourceId = sourceId,
            CreateTime = DateTime.Now
        });
    });
}
// Weibo hot search
if (item.Source == HotNewsEnum.weibo)
{
    var nodes = ((HtmlDocument)item.Result).DocumentNode.SelectNodes("//table/tbody/tr/td[2]/a").ToList();
    nodes.ForEach(x =>
    {
        hotNews.Add(new HotNews
        {
            Title = x.InnerText,
            Url = $"https://s.weibo.com{x.GetAttributeValue("href", "").Replace("#", "%23")}",
            SourceId = sourceId,
            CreateTime = DateTime.Now
        });
    });
}
// Zhihu hot list
if (item.Source == HotNewsEnum.zhihu)
{
    var obj = JObject.Parse(((HtmlDocument)item.Result).ParsedText);
    var nodes = obj["data"];
    foreach (var node in nodes)
    {
        hotNews.Add(new HotNews
        {
            Title = node["target"]["title"].ToString(),
            Url = $"https://www.zhihu.com/question/{node["target"]["id"]}",
            SourceId = sourceId,
            CreateTime = DateTime.Now
        });
    }
}
// Zhihu Daily
if (item.Source == HotNewsEnum.zhihudaily)
{
    var nodes = ((HtmlDocument)item.Result).DocumentNode.SelectNodes("//div[@class='box']/a").ToList();
    nodes.ForEach(x =>
    {
        hotNews.Add(new HotNews
        {
            Title = x.InnerText,
            Url = $"https://daily.zhihu.com{x.GetAttributeValue("href", "")}",
            SourceId = sourceId,
            CreateTime = DateTime.Now
        });
    });
}
// NetEase News
if (item.Source == HotNewsEnum.news163)
{
    var nodes = ((HtmlDocument)item.Result).DocumentNode.SelectNodes("//div[@class='area-half left']/div[@class='tabBox']/div[@class='tabContents active']/table//tr/td[1]/a").ToList();
    nodes.ForEach(x =>
    {
        hotNews.Add(new HotNews
        {
            Title = x.InnerText,
            Url = x.GetAttributeValue("href", ""),
            SourceId = sourceId,
            CreateTime = DateTime.Now
        });
    });
}
// GitHub
if (item.Source == HotNewsEnum.github)
{
    var nodes = ((HtmlDocument)item.Result).DocumentNode.SelectNodes("//article[@class='Box-row']/h1/a").ToList();
    nodes.ForEach(x =>
    {
        hotNews.Add(new HotNews
        {
            Title = x.InnerText.Trim().Replace("\n", "").Replace(" ", ""),
            Url = $"https://github.com{x.GetAttributeValue("href", "")}",
            SourceId = sourceId,
            CreateTime = DateTime.Now
        });
    });
}
// Douyin hot topics
if (item.Source == HotNewsEnum.douyin_hot)
{
    var obj = JObject.Parse(((HtmlDocument)item.Result).ParsedText);
    var nodes = obj["word_list"];
    foreach (var node in nodes)
    {
        hotNews.Add(new HotNews
        {
            Title = node["word"].ToString(),
            Url = $"#{node["hot_value"]}",
            SourceId = sourceId,
            CreateTime = DateTime.Now
        });
    }
}
// Douyin videos & Douyin positive energy
if (item.Source == HotNewsEnum.douyin_video || item.Source == HotNewsEnum.douyin_positive)
{
    var obj = JObject.Parse(((HtmlDocument)item.Result).ParsedText);
    var nodes = obj["aweme_list"];
    foreach (var node in nodes)
    {
        hotNews.Add(new HotNews
        {
            Title = node["aweme_info"]["desc"].ToString(),
            Url = node["aweme_info"]["share_url"].ToString(),
            SourceId = sourceId,
            CreateTime = DateTime.Now
        });
    }
}
In each branch, item.Result is cast to the concrete type first. Once all the data has been collected, the existing rows are deleted and the fresh batch is bulk-inserted.
Then create the extension method UseHotNewsJob() and call it in the module class.
// MeowvBlogBackgroundJobsExtensions.cs
...
/// <summary>
/// Daily hot news scraping
/// </summary>
/// <param name="service"></param>
public static void UseHotNewsJob(this IServiceProvider service)
{
    var job = service.GetService<HotNewsJob>();
    RecurringJob.AddOrUpdate("每日熱點資料抓取", () => job.ExecuteAsync(), CronType.Hour(1, 2));
}
...
This schedules the recurring job to run once every 2 hours.
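Note that CronType.Hour(1, 2) is a helper from this blog project, not part of Hangfire itself. Assuming stock Hangfire, the same every-2-hours schedule can be expressed with a standard cron string via the string overload of RecurringJob.AddOrUpdate (a sketch, reusing the job variable from above):

```csharp
// Equivalent registration using Hangfire's built-in cron-string support:
// "0 */2 * * *" fires at minute 0 of every second hour.
RecurringJob.AddOrUpdate("HotNewsJob", () => job.ExecuteAsync(), "0 */2 * * *");
```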
...
public override void OnApplicationInitialization(ApplicationInitializationContext context)
{
    ...
    var service = context.ServiceProvider;
    ...
    service.UseHotNewsJob();
}
Compile and run, and our scheduled task now shows up among the recurring jobs.
It won't execute before its scheduled time, so trigger it manually and wait a moment to see the effect.
After the run completes, all the hot news data has been saved to the database, which means the crawler is done, and Hangfire will keep executing it on the given schedule. Have you learned it?