用python實現的抓取騰訊視訊所有電影的爬蟲

pythontab發表於2013-08-23

原文網址 : https://www.pythontab.com/html/2013/pythonhexinbiancheng_0823/547.html

用 Python 實現的抓取騰訊視訊所有電影的爬蟲（文章不錯，所以進行了轉載）。以下程式碼為 Python 2 寫法（使用 urllib2 與 print 陳述式），並依賴 BeautifulSoup 4 與 pymongo。

# -*- coding: utf-8 -*-
import re
import urllib2
from bs4 import BeautifulSoup
import string, time
import pymongo
 
NUM     = 0         # global variable: number of movies
m_type  = u''       # global variable: movie category (type)
m_site  = u'qq' # global variable: site the movies come from
 
def gethtml(url):
    """Fetch the page at the given URL and return its body.

    Args:
        url: Address to request; passed unchanged to ``urllib2.Request``.

    Returns:
        The response body exactly as returned by ``response.read()``.

    Raises:
        Nothing is caught here: any error raised while opening or reading
        the URL propagates to the caller.
    """
    req = urllib2.Request(url)
    response = urllib2.urlopen(req)
    try:
        return response.read()
    finally:
        # Always release the connection, even when read() fails; the crawl
        # calls this once per page, so leaked handles would pile up.
        response.close()
 
def gettags(html):
    """Extract the movie categories from the category list page.

    Args:
        html: Markup of the category list page.

    Returns:
        A dict mapping each category title to the URL of that category's
        list page (both decoded from UTF-8).  The dict is empty when the
        category list or its links cannot be found; "Not Find" is printed
        in that case.

    Side effects:
        Rebinds the module-level ``m_type`` to the title of the last link
        matched.  This mirrors the original behaviour and is kept so that
        existing readers of ``m_type`` see the same value.
    """
    global m_type
    # Bound up front so every exit path has something to return.
    tags_url = {}

    soup = BeautifulSoup(html)
    # Container of the category links:
    #   <ul class="clearfix _group" gname="mi_type" gtype="1">
    tags_all = soup.find_all('ul', {'class' : 'clearfix _group' , 'gname' : 'mi_type'})
    if not tags_all:
        print("Not Find")
        return tags_url

    # Shape of one category link:
    #   <a _hot="tag.sub" class="_gtag _hotkey"
    #      href="http://v.qq.com/list/1_0_-1_-1_1_0_0_20_0_-1_0.html"
    #      title="..." tvalue="0">...</a>
    re_tags = r'<a _hot=\"tag\.sub\" class=\"_gtag _hotkey\" href=\"(.+?)\" title=\"(.+?)\" tvalue=\"(.+?)\">.+?</a>'
    p = re.compile(re_tags, re.DOTALL)

    # Only the first matching <ul> is searched.
    tags = p.findall(str(tags_all[0]))
    if not tags:
        print("Not Find")
        return tags_url

    for tag in tags:
        # Each match is (href, title, tvalue); tvalue is not used.
        m_type = tag[1].decode('utf-8')
        tags_url[m_type] = tag[0].decode('utf-8')
    return tags_url
 
def get_pages(tag_url):
    """Determine how many list pages a category has.

    Args:
        tag_url: URL of the category's list page; fetched with gethtml().

    Returns:
        The text of the second-to-last pager link when the pager contains
        more than one link (presumably the highest page number, with the
        last link being a "next" control -- confirm against the live
        markup), otherwise the integer 1.  The two cases have different
        types, so callers should wrap the result in int().
    """
    tag_html = gethtml(tag_url)
    soup = BeautifulSoup(tag_html)
    # Pager container: <div class="mod_pagenav" id="pager">
    div_page = soup.find_all('div', {'class' : 'mod_pagenav', 'id' : 'pager'})
    if not div_page:
        # No pager on the page: treat the category as a single page instead
        # of failing with IndexError on div_page[0].
        return 1

    # Shape of one pager link:
    #   <a class="c_txt6" href="http://v.qq.com/list/1_2_-1_-1_1_0_24_20_0_-1_0.html" title="25"><span>25</span></a>
    re_pages = r'<a class=.+?><span>(.+?)</span></a>'
    p = re.compile(re_pages, re.DOTALL)
    pages = p.findall(str(div_page[0]))
    if len(pages) > 1:
        return pages[-2]
    return 1
     
 
def getmovielist(html):
    """Pass every movie list found in a list page on to getmovie().

    Args:
        html: Markup of one category list page.

    Each ``<ul class="mod_list_pic_130">`` element is serialised, stripped
    of newline characters and handed to getmovie() in document order.
    """
    page = BeautifulSoup(html)
    list_filter = {'class' : 'mod_list_pic_130'}
    for movie_list in page.find_all('ul', list_filter):
        # getmovie() receives the list markup as a single line.
        flattened = str(movie_list).replace('\n', '')
        getmovie(flattened)
 
 
def getmovie(html):
    """Parse the movies out of one list fragment and store them in MongoDB.

    Every match is printed and inserted into the ``playlinks`` collection
    of the ``dianying`` database on localhost:27017 as a document with the
    keys ``movie_title``, ``movie_url``, ``movie_site`` and ``movie_type``.

    Args:
        html: Markup of one movie list; getmovielist() passes it with the
            newline characters already removed.

    Side effects:
        Increments the module-level ``NUM`` once per movie and reads the
        module-level ``m_type`` and ``m_site`` for the stored document.
        Does nothing when the fragment contains no matching entry.
    """
    global NUM
    global m_type
    global m_site

    re_movie = r'<li><a class=\"mod_poster_130\" href=\"(.+?)\" target=\"_blank\" title=\"(.+?)\"><img.+?</li>'
    p = re.compile(re_movie, re.DOTALL)
    movies = p.findall(html)
    if not movies:
        return

    conn = pymongo.Connection('localhost', 27017)
    try:
        playlinks = conn.dianying.playlinks
        # Each match is (href, title).
        for movie_url, movie_title in movies:
            # Count each movie exactly once so NUM stays a real movie count.
            NUM += 1
            print("%s : %d" % ("=" * 70, NUM))
            values = dict(
                movie_title = movie_title,
                movie_url   = movie_url,
                movie_site  = m_site,
                movie_type  = m_type
                )
            print(values)
            playlinks.insert(values)
            print("_" * 70)
    finally:
        # This function runs once per list on every page; close the
        # connection instead of leaving one open per call.
        conn.close()
 
def getmovieinfo(url):
    """Fetch a movie page and collect the links inside its album block.

    Args:
        url: Address of the movie page; fetched with gethtml().

    Returns:
        A list of ``(href, title)`` tuples, one per matching link in the
        first ``<div class="pack pack_album album_cover">`` element.  The
        list is empty, and "Not find movie info" is printed, when the
        element is missing or contains no matching link.
    """
    html = gethtml(url)
    soup = BeautifulSoup(html)

    divs = soup.find_all('div', {'class' : 'pack pack_album album_cover'})

    m_info = []
    if divs:
        # Shape of one link:
        #   <a href="http://www.tudou.com/albumplay/9NyofXc_lHI/32JqhiKJykI.html" target="new" title="..." wl="1"> </a>
        re_info = r'<a href=\"(.+?)\" target=\"new\" title=\"(.+?)\" wl=\".+?\"> </a>'
        p_info = re.compile(re_info, re.DOTALL)
        m_info = p_info.findall(str(divs[0]))

    if not m_info:
        print("Not find movie info")
    return m_info
 
 
def insertdb(movieinfo):
    """Insert one movie document into the ``dianying_at.movies`` collection.

    Args:
        movieinfo: Document passed unchanged to the collection's insert().

    Side effects:
        Uses the module-level ``conn``.  Nothing in this file assigns it,
        so it is created on first use with the same host and port that
        getmovie() connects to; a ``conn`` set by the caller beforehand is
        used as is.
    """
    global conn
    if globals().get('conn') is None:
        conn = pymongo.Connection('localhost', 27017)
    movie_db = conn.dianying_at
    movies = movie_db.movies
    movies.insert(movieinfo)
 
if __name__ == "__main__":
    # Crawl every movie category: read the category list, work out how many
    # pages each category has, then parse and store every page.

    # Entry page that lists all categories.
    tags_url = "http://v.qq.com/list/1_-1_-1_-1_1_0_0_20_0_-1_0.html"
    tags_html = gethtml(tags_url)
    tag_urls = gettags(tags_html)

    for tag_name, tag_url in tag_urls.items():
        # getmovie() stamps every stored record with the module-level
        # m_type, so it has to name the category being crawled right now;
        # otherwise all movies carry whatever title gettags() matched last.
        m_type = tag_name

        # Encode once here; the same byte string is used for every request.
        first_page_url = tag_url.encode('utf-8')
        print(first_page_url)
        maxpage = int(get_pages(first_page_url))
        print(maxpage)

        # A list URL looks like
        #   http://v.qq.com/list/1_0_-1_-1_1_0_0_20_0_-1_0.html
        # Cut off the tail that starts at the page index and re-append it
        # with the wanted index for each page.
        m_url = first_page_url.replace('0_20_0_-1_0.html', '')
        for x in range(0, maxpage):
            movie_url = "%s%d_20_0_-1_0.html" % (m_url, x)
            print(movie_url)
            movie_html = gethtml(movie_url)
            getmovielist(movie_html)
            # Short pause between page requests.
            time.sleep(0.1)

Python爬蟲抓取股票資訊
2021-01-03
Python爬蟲
Python爬蟲抓取知乎所有使用者資訊
2018-03-14
Python爬蟲
python 爬取騰訊視訊的全部評論
2021-02-17
Python
python爬蟲抓取哈爾濱天氣資訊（靜態爬蟲）
2020-04-05
Python爬蟲
Python爬蟲入門實戰之貓眼電影資料抓取（實戰篇）
2019-04-07
Python爬蟲
使用Scrapy抓取優酷視訊列表頁（電影/電視）
2019-02-16
新手寫的視訊爬蟲
2020-12-16
爬蟲
用Python爬蟲抓取代理IP
2019-04-17
Python爬蟲
Python爬蟲入門實戰之貓眼電影資料抓取(理論篇)
2019-04-06
Python爬蟲
爬蟲app資訊抓取之apk反編譯抓取
2019-05-10
爬蟲APPAPK編譯
Python爬蟲二：抓取京東商品列表頁面資訊
2018-06-26
Python爬蟲
爬蟲01:爬取豆瓣電影TOP 250基本資訊
2020-12-29
爬蟲
python網路爬蟲_Python爬蟲：30個小時搞定Python網路爬蟲視訊教程
2020-10-21
Python爬蟲
Python爬蟲實戰：爬取淘寶的商品資訊
2021-09-11
Python爬蟲
【Python3網路爬蟲開發實戰】3.4-抓取貓眼電影排行
2019-07-04
Python爬蟲
python愛奇藝VIP視訊爬蟲爬取下載
2018-04-20
Python爬蟲
Python爬取所有人位置資訊——騰訊位置大資料！
2020-11-13
Python大資料
Scrapy爬蟲：實習僧網最新招聘資訊抓取
2021-09-09
爬蟲
Python爬蟲入門教程 50-100 Python3爬蟲爬取VIP視訊-Python爬蟲6操作
2019-02-14
Python爬蟲
Python爬蟲抓取技術的門道
2019-09-21
Python爬蟲
用node+puppeteer騰訊視訊爬取例項
2019-03-13
2019騰訊視訊年度指數報告–電影篇
2019-12-29
python爬蟲--招聘資訊
2018-11-03
Python爬蟲
Python3 網路爬蟲實戰的視訊和掃描版書
2018-11-14
Python爬蟲
利用 Python 爬蟲實現快遞物流資訊查詢
2020-09-25
Python爬蟲
如何爬取視訊的爬蟲程式碼原始碼
2020-12-26
爬蟲原始碼
【Python3網路爬蟲開發實戰】3-基本庫的使用-4抓取貓眼電影排行
2019-02-13
Python爬蟲
視訊教程-Python網路爬蟲開發與專案實戰-Python
2020-05-28
Python爬蟲
python如何抓取手機app上的視訊
2021-12-07
PythonAPP
python的爬蟲功能如何實現
2019-02-28
Python爬蟲
Python爬蟲是如何實現的？
2022-07-15
Python爬蟲
python爬蟲練習之爬取豆瓣讀書所有標籤下的書籍資訊
2018-07-23
Python爬蟲
Python爬蟲實戰之（二）| 尋找你的招聘資訊
2018-04-28
Python爬蟲
Python爬蟲訓練：爬取酷燃網視訊資料
2020-10-23
Python爬蟲
python爬蟲練習--爬取虎牙主播原畫視訊
2020-11-28
Python爬蟲
Python3爬取貓眼電影資訊
2020-11-06
Python
Python抓取VIP電影
2019-01-22
Python
實戰案例｜拒絕資訊洩露，騰訊雲助力電商對抗網路爬蟲
2021-10-14
爬蟲
Python實現拼多多商品資訊抓取方法
2023-10-10
Python

用python實現的抓取騰訊視訊所有電影的爬蟲

相關文章