Golang框架beego電影網爬蟲小試牛刀

OldBoy~發表於2018-09-25

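學習了一段時間golang,又參考課程學習了用beego開發網站爬蟲。專案(crawl_movie)的目錄結構(原文的結構圖已缺失,以下依本文出現的檔案整理)是:routers/router.go、controllers/crawlMovie.go、models/movie_info.go、models/redis.go。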

採集的目標是豆瓣網電影,入口地址是:https://movie.douban.com/subject/1900841/?from=subject-page

資料表結構

-- movie_info: one row per crawled movie page. Besides the movie attributes it
-- carries bookkeeping columns (remark, create_time, modify_time, status).
-- NOTE(review): movie_on_time and create_time default to the zero date
-- '0000-00-00 00:00:00'; servers running with NO_ZERO_DATE / strict sql_mode
-- presumably reject this definition -- confirm against the target MySQL.
-- NOTE(review): idx_movie_id is a plain KEY, not UNIQUE, so the schema itself
-- does not prevent the same movie_id from being stored more than once.
CREATE TABLE `movie_info` (
  `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
  `movie_id` int(11) unsigned NOT NULL COMMENT '電影id',
  `movie_name` varchar(100) DEFAULT NULL COMMENT '電影名稱',
  `movie_pic` varchar(200) DEFAULT NULL COMMENT '電影圖片',
  `movie_director` varchar(50) DEFAULT NULL COMMENT '電影導演',
  `movie_writer` varchar(50) DEFAULT NULL COMMENT '電影編劇',
  `movie_country` varchar(50) DEFAULT NULL COMMENT '電影產地',
  `movie_language` varchar(50) DEFAULT NULL COMMENT '電影語言',
  `movie_main_character` varchar(50) DEFAULT NULL COMMENT '電影主演',
  `movie_type` varchar(50) DEFAULT NULL COMMENT '電影型別',
  `movie_on_time` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00' COMMENT '電影上映時間',
  `movie_span` varchar(20) DEFAULT NULL COMMENT '電影時長',
  `movie_grade` varchar(5) DEFAULT NULL COMMENT '電影評分',
  `remark` varchar(500) DEFAULT '' COMMENT '備註',
  `create_time` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00' COMMENT '建立時間',
  `modify_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '修改時間',
  `status` tinyint(1) DEFAULT '1',
  PRIMARY KEY (`id`),
  KEY `idx_movie_id` (`movie_id`),
  KEY `idx_create_time` (`create_time`),
  KEY `idx_modify_time` (`modify_time`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='電影資訊表';

路由檔案router.go

//router.go檔案
package routers

import (
    "crawl_movie/controllers"

    "github.com/astaxie/beego"
)

func init() {
    beego.Router("/", &controllers.MainController{})
    beego.Router("/crawl_movie", &controllers.CrawlMovieController{}, "*:CrawlMovie")
}

controllers目錄下檔案

//crawlMovie.go 檔案
package controllers

import (
    "crawl_movie/models"
    "fmt" 
    "runtime"
    "time"

    "github.com/astaxie/beego"
    "github.com/astaxie/beego/httplib"
)

// CrawlMovieController is the beego controller that exposes the crawler; its
// CrawlMovie method is the handler registered for "/crawl_movie" in router.go.
// It embeds beego.Controller and adds no fields of its own.
type CrawlMovieController struct {
    beego.Controller
}

// PrintErr recovers from an in-flight panic and prints the panic value
// followed by up to ten stack frames (function, file, line) to stdout.
//
// It must be installed with `defer PrintErr()`: recover only stops a panic
// when it is called directly by a deferred function, so a plain call returns
// immediately without effect. When no panic is pending it prints nothing.
func PrintErr() {
    cause := recover()
    if cause == nil {
        return
    }

    fmt.Printf("%v", cause)
    for depth := 0; depth < 10; depth++ {
        pc, file, line, ok := runtime.Caller(depth)
        if !ok {
            // No frame at this depth; nothing to print for it.
            continue
        }
        fmt.Printf("frame %v:[func:%v,file:%v,line:%v]\n", depth, runtime.FuncForPC(pc).Name(), file, line)
    }
}
// CrawlMovie crawls Douban movie pages starting from a fixed entry URL and
// streams progress text to the HTTP response.
//
// Each loop iteration reports the Redis queue length, pops one URL, skips it
// if it is already in the visited set, fetches the page, stores a movie record
// when the page carries a movie title, enqueues every movie.douban.com link
// found on the page, marks the URL as visited and sleeps for one second. The
// loop ends once the queue length is reported as zero.
//
// A failed page fetch panics; the deferred PrintErr recovers it and prints the
// error with a short stack trace, which ends the crawl for this request.
func (c *CrawlMovieController) CrawlMovie() {
    // recover only works inside a deferred call, so PrintErr has to be
    // deferred; calling it directly here would never see a later panic.
    defer PrintErr()

    models.ConnectRedis("127.0.0.1:6379") // point the shared Redis client at the local server

    // Entry URL of the crawl.
    sUrl := "https://movie.douban.com/subject/1900841/?from=subject-page"
    models.PutinQueue(sUrl)

    for {
        length := models.GetQueueLength()
        c.Ctx.WriteString(fmt.Sprintf("---%v---", length))
        if length == 0 {
            break // queue drained: stop crawling
        }
        sUrl = models.PopfromQueue()
        // Skip URLs that were already processed.
        if models.IsVisit(sUrl) {
            continue
        }
        rsp := httplib.Get(sUrl)
        // User-Agent and Cookie are set to avoid Douban answering with 403.
        rsp.Header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0")
        rsp.Header("Cookie", `bid=gFP9qSgGTfA; __utma=30149280.1124851270.1482153600.1483055851.1483064193.8; __utmz=30149280.1482971588.4.2.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; ll="118221"; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1483064193%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_id.100001.4cf6=5afcf5e5496eab22.1482413017.7.1483066280.1483057909.; __utma=223695111.1636117731.1482413017.1483055857.1483064193.7; __utmz=223695111.1483055857.6.5.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _vwo_uuid_v2=BDC2DBEDF8958EC838F9D9394CC5D9A0|2cc6ef7952be8c2d5408cb7c8cce2684; ap=1; viewed="1006073"; gr_user_id=e5c932fc-2af6-4861-8a4f-5d696f34570b; __utmc=30149280; __utmc=223695111; _pk_ses.100001.4cf6=*; __utmb=30149280.0.10.1483064193; __utmb=223695111.0.10.1483064193`)
        sMovieHtml, err := rsp.String()
        if err != nil {
            panic(err)
        }

        // Use a fresh record for every page. A record shared across
        // iterations would carry the previous movie's fields (and the Id
        // assigned by the previous insert) into the next row.
        var movieInfo models.MovieInfo

        movieInfo.Movie_name = models.GetMovieName(sMovieHtml)
        if movieInfo.Movie_name != "" { // an empty name means the page is not a movie page
            movieInfo.Movie_director = models.GetMovieDirector(sMovieHtml)
            movieInfo.Movie_main_character = models.GetMovieMainCharacters(sMovieHtml)
            movieInfo.Movie_type = models.GetMovieGenre(sMovieHtml)
            movieInfo.Movie_on_time = models.GetMovieOnTime(sMovieHtml)
            movieInfo.Movie_grade = models.GetMovieGrade(sMovieHtml)
            movieInfo.Movie_span = models.GetMovieRunningTime(sMovieHtml)

            // Store the record; a failed insert is reported but does not
            // abort the crawl.
            if _, err := models.AddMovie(&movieInfo); err != nil {
                fmt.Printf("insert movie %q from %v: %v\n", movieInfo.Movie_name, sUrl, err)
            }
        }

        // Enqueue every link found on this page so it is visited later.
        urls := models.GetMovieUrls(sMovieHtml)
        for _, url := range urls {
            models.PutinQueue(url)
            c.Ctx.WriteString("<br>" + url + "</br>")
        }

        // Record sUrl in the visited set so it is not processed again.
        models.AddToSet(sUrl)
        time.Sleep(time.Second) // throttle requests
    }
    c.Ctx.WriteString("爬蟲執行結束")
}

models目錄下檔案

//movie_info.go 檔案
package models

import (
    "regexp" //正則包
    "strings"

    "github.com/astaxie/beego/orm"
    _ "github.com/go-sql-driver/mysql"
)

var (
    // db is the package-wide ORM handle; it is assigned once in init and
    // used by AddMovie.
    db orm.Ormer
)

// MovieInfo is the ORM model for one crawled movie; it is registered with
// orm.RegisterModel in init and inserted by AddMovie.
//
// The field names correspond to columns of the movie_info table shown in the
// article's schema (presumably via the ORM's snake_case name mapping — TODO
// confirm). The table's remark, modify_time and status columns have no field
// here.
//
// NOTE(review): Movie_on_time and Create_time are strings while the matching
// columns are declared as timestamp; confirm the values written are accepted
// by the database.
type MovieInfo struct {
    Id                   int64
    Movie_id             int64
    Movie_name           string
    Movie_pic            string
    Movie_director       string
    Movie_writer         string
    Movie_country        string
    Movie_language       string
    Movie_main_character string
    Movie_type           string
    Movie_on_time        string
    Movie_span           string
    Movie_grade          string
    Create_time          string
}

// init configures the ORM for this package: it turns on debug output,
// registers the "default" MySQL database, registers the MovieInfo model and
// creates the package-level Ormer stored in db.
//
// NOTE(review): the result of orm.RegisterDataBase is discarded, so a
// registration failure is not reported at this point.
// NOTE(review): the DSN hard-codes the root:root credentials and the local
// address; consider moving them to configuration.
func init() {
    orm.Debug = true // debug mode switch; when enabled the SQL statements are printed
    orm.RegisterDataBase("default", "mysql", "root:root@tcp(127.0.0.1:3306)/beego?charset=utf8")
    orm.RegisterModel(new(MovieInfo))
    db = orm.NewOrm()
}

// AddMovie inserts movie_info through the package-level ORM handle and
// returns whatever db.Insert reports: the id value and the error.
func AddMovie(movie_info *MovieInfo) (int64, error) {
    return db.Insert(movie_info)
}

// movieDirectorRe captures the link text of every anchor tagged
// rel="v:directedBy". The capture is lazy so that it stops at the first
// closing </a> rather than running on to the last one on the line.
var movieDirectorRe = regexp.MustCompile(`<a.*?rel="v:directedBy">(.*?)</a>`)

// GetMovieDirector extracts the director name(s) from a movie page.
//
// It returns "" when movieHtml is empty or contains no rel="v:directedBy"
// anchor (instead of panicking on a missing match). When several directors
// are listed, their names are joined with "/", matching how
// GetMovieMainCharacters and GetMovieGenre report multiple values.
func GetMovieDirector(movieHtml string) string {
    matches := movieDirectorRe.FindAllStringSubmatch(movieHtml, -1)
    if len(matches) == 0 {
        return ""
    }

    names := make([]string, 0, len(matches))
    for _, m := range matches {
        names = append(names, m[1])
    }
    return strings.Join(names, "/")
}

// movieNameRe captures the text of the <span property="v:itemreviewed"> element.
var movieNameRe = regexp.MustCompile(`<span\s*property="v:itemreviewed">(.*?)</span>`)

// GetMovieName returns the movie title found in movieHtml, i.e. the text of
// the first <span property="v:itemreviewed"> element. It returns "" when the
// input is empty or no such element exists.
func GetMovieName(movieHtml string) string {
    if len(movieHtml) == 0 {
        return ""
    }

    first := movieNameRe.FindStringSubmatch(movieHtml)
    if first == nil {
        return ""
    }
    return first[1]
}

// movieStarringRe captures the link text of every anchor tagged rel="v:starring".
var movieStarringRe = regexp.MustCompile(`<a.*?rel="v:starring">(.*?)</a>`)

// GetMovieMainCharacters returns the leading actors found in movieHtml, joined
// with "/" and with any leading or trailing "/" stripped. It returns "" when
// no rel="v:starring" anchor is present.
func GetMovieMainCharacters(movieHtml string) string {
    matches := movieStarringRe.FindAllStringSubmatch(movieHtml, -1)
    if len(matches) == 0 {
        return ""
    }

    actors := make([]string, 0, len(matches))
    for _, m := range matches {
        actors = append(actors, m[1])
    }
    return strings.Trim(strings.Join(actors, "/"), "/")
}

// movieGradeRe captures the text of the <strong property="v:average"> element.
var movieGradeRe = regexp.MustCompile(`<strong.*?property="v:average">(.*?)</strong>`)

// GetMovieGrade returns the movie's rating as it appears in movieHtml: the
// text of the first <strong ... property="v:average"> element, or "" when the
// page has none.
func GetMovieGrade(movieHtml string) string {
    first := movieGradeRe.FindStringSubmatch(movieHtml)
    if first == nil {
        return ""
    }
    return first[1]
}

// movieGenreRe captures the text of every <span property="v:genre"> element.
var movieGenreRe = regexp.MustCompile(`<span.*?property="v:genre">(.*?)</span>`)

// GetMovieGenre returns the genres listed in movieHtml, joined with "/" and
// with any leading or trailing "/" stripped. It returns "" when the page has
// no <span property="v:genre"> element.
func GetMovieGenre(movieHtml string) string {
    matches := movieGenreRe.FindAllStringSubmatch(movieHtml, -1)
    if len(matches) == 0 {
        return ""
    }

    var joined strings.Builder
    for _, m := range matches {
        joined.WriteString(m[1])
        joined.WriteByte('/')
    }
    return strings.Trim(joined.String(), "/")
}

// movieOnTimeRe captures the text of the <span property="v:initialReleaseDate"> element.
var movieOnTimeRe = regexp.MustCompile(`<span.*?property="v:initialReleaseDate".*?>(.*?)</span>`)

// GetMovieOnTime returns the release date text found in movieHtml: the content
// of the first <span ... property="v:initialReleaseDate" ...> element, exactly
// as written on the page, or "" when there is none.
func GetMovieOnTime(movieHtml string) string {
    if first := movieOnTimeRe.FindStringSubmatch(movieHtml); first != nil {
        return first[1]
    }
    return ""
}

// movieRuntimeRe captures the text of the <span property="v:runtime"> element.
var movieRuntimeRe = regexp.MustCompile(`<span.*?property="v:runtime".*?>(.*?)</span>`)

// GetMovieRunningTime returns the running-time text found in movieHtml: the
// content of the first <span ... property="v:runtime" ...> element, or ""
// when the page has none.
func GetMovieRunningTime(movieHtml string) string {
    first := movieRuntimeRe.FindStringSubmatch(movieHtml)
    if len(first) == 0 {
        return ""
    }
    return first[1]
}

// movieURLRe captures the href of anchors that point at movie.douban.com.
// The dots in the host are escaped so that lookalike hosts such as
// "movieXdouban.com" do not match.
var movieURLRe = regexp.MustCompile(`<a.*?href="(https://movie\.douban\.com/.*?)"`)

// GetMovieUrls returns the https://movie.douban.com/ links found in anchors of
// movieHtml, in order of first appearance. A link that occurs several times on
// the page is returned only once, so callers do not enqueue it repeatedly.
// The result is nil when the page contains no such link.
func GetMovieUrls(movieHtml string) []string {
    matches := movieURLRe.FindAllStringSubmatch(movieHtml, -1)

    var movieSets []string
    seen := make(map[string]struct{}, len(matches))
    for _, m := range matches {
        link := m[1]
        if _, dup := seen[link]; dup {
            continue
        }
        seen[link] = struct{}{}
        movieSets = append(movieSets, link)
    }
    return movieSets
}
//redis.go
package models

import (
    "github.com/astaxie/goredis"
)

var (
    // client is the package-wide goredis client shared by all queue and set
    // helpers in this file; ConnectRedis assigns its Addr.
    client goredis.Client
)

// Redis key names used by the crawler.
const (
    URL_QUEUE     = "url_queue"     // key of the list used as the URL queue
    URL_VISIT_SET = "url_visit_set" // key of the set recording URLs that were already visited
)

// ConnectRedis stores addr on the package-level client. The function itself
// only assigns client.Addr; it performs no other work and reports no error.
func ConnectRedis(addr string) {
    client.Addr = addr
}

// PutinQueue pushes url onto the URL_QUEUE list with Lpush; together with the
// Rpop in PopfromQueue the list is consumed from the opposite end.
// NOTE(review): whatever Lpush returns is discarded, so a failed push goes
// unnoticed by the caller.
func PutinQueue(url string) {
    client.Lpush(URL_QUEUE, []byte(url))
}

// PopfromQueue removes one entry from the URL_QUEUE list with Rpop and returns
// it as a string. It panics with the Redis error when Rpop fails.
func PopfromQueue() string {
    raw, err := client.Rpop(URL_QUEUE)
    if err == nil {
        return string(raw)
    }
    panic(err)
}

// AddToSet adds url to the URL_VISIT_SET set, which records the URLs that
// have already been visited.
// NOTE(review): whatever Sadd returns is discarded, so a failed write goes
// unnoticed by the caller.
func AddToSet(url string) {
    client.Sadd(URL_VISIT_SET, []byte(url))
}

// GetQueueLength returns the length of the URL_QUEUE list as reported by Llen.
// When Llen fails the error is swallowed and 0 is returned, which callers
// cannot distinguish from an empty queue.
func GetQueueLength() int {
    if n, err := client.Llen(URL_QUEUE); err == nil {
        return n
    }
    return 0
}

// IsVisit reports whether url is a member of the URL_VISIT_SET set. A failed
// Sismember call is treated as "not visited" and yields false.
func IsVisit(url string) bool {
    visited, err := client.Sismember(URL_VISIT_SET, []byte(url))
    return err == nil && visited
}

 

相關文章