本文轉自公眾號：[ 法納斯特]

12月已開始了，離2018年的結束也就半個多月的時間了，還記得年初立下的flag嗎？

完成了多少？相信很多人和我一樣，抱頭痛哭...

本次利用貓眼電影，實現對2018年的電影大資料進行分析。

【Python爬蟲&資料分析】2018年電影，你看了幾部？

/ 01 / 網頁分析

01 標籤

【Python爬蟲&資料分析】2018年電影，你看了幾部？

透過點選貓眼電影已經歸類好的標籤，得到網址資訊。

02 索引頁

【Python爬蟲&資料分析】2018年電影，你看了幾部？

開啟開發人員工具，獲取索引頁裡電影的連結以及評分資訊。

索引頁一共有30多頁，但是有電影評分的只有10頁。

本次只對有電影評分的資料進行獲取。

03 詳情頁

【Python爬蟲&資料分析】2018年電影，你看了幾部？

對詳情頁的資訊進行獲取。

主要是名稱，型別，國家，時長，上映時間，評分，評分人數，累計票房。

/ 02 / 反爬破解

【Python爬蟲&資料分析】2018年電影，你看了幾部？

透過開發人員工具發現，貓眼針對評分，評分人數，累計票房的資料，施加了文字反爬。

【Python爬蟲&資料分析】2018年電影，你看了幾部？

透過檢視網頁原始碼，發現只要重新整理頁面，三處文字編碼就會改變，無法直接匹配資訊。

所以需要下載頁面所使用的字型檔案(woff)，對其進行雙匹配：先以字形比對找出每個編碼對應的真實數字，再用這組對應關係替換頁面中的反爬文字。

from fontTools.ttLib import TTFont

# Optional: the same conversion for the reference font (left disabled).
#font = TTFont('base.woff')
#font.saveXML('base.xml')
# Load the Maoyan woff font from the working directory and dump it as XML
# so that its glyph tables can be inspected in an editor.
font = TTFont('maoyan.woff')
font.saveXML('maoyan.xml')

將woff格式轉換為xml格式，以便在Pycharm中檢視詳細資訊。

利用下面這個網站，開啟woff檔案。

url:（原文此處的連結在轉載時遺失；任何能開啟woff檔案的線上字型編輯器皆可，例如百度FontEditor）

可以得到下面數字部分資訊(上下兩塊)。

在Pycharm中檢視xml格式檔案(左右兩塊)，你就會發現有對應資訊。

【Python爬蟲&資料分析】2018年電影，你看了幾部？

透過上圖你就可以將數字6對上號了，其他數字一樣的。

def get_numbers(u):
    """
    Crack Maoyan's font-based text obfuscation for one detail page.

    The page source references a woff font whose glyphs stand for the
    characters '.', '0'-'9'. The font is downloaded to ``maoyan.woff`` and
    each of its glyphs is compared with the glyphs of the local reference
    font ``base.woff``, whose glyph-to-character mapping is hard-coded below.

    :param u: HTML source of a detail page containing the
        ``url('//....woff') format('woff')`` font declaration.
    :return: tuple ``(maoyan_num_list, utf8last)`` of two parallel lists:
        the real characters and the obfuscated characters that replace them
        in the page text.
    :raises ValueError: if the page contains no font URL, or a glyph of the
        downloaded font has no identical glyph in the reference font.
    """
    font_url_pattern = re.compile(r",\n           url\('(//.*.woff)'\) format\('woff'\)")
    font_urls = font_url_pattern.findall(u)
    if not font_urls:
        raise ValueError("no woff font URL found in page source")

    # The URL embedded in the page is scheme-relative ("//host/path").
    response = requests.get("http:" + font_urls[0], stream=True)
    # Fail loudly instead of saving an error page as a font file.
    response.raise_for_status()
    with open("maoyan.woff", "wb") as font_file:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                font_file.write(chunk)

    base_font = TTFont('base.woff')
    maoyan_font = TTFont('maoyan.woff')
    # Reference mapping: base_unicode_list[k] is the glyph name in base.woff
    # that draws the character base_num_list[k].
    base_num_list = ['.', '3', '0', '8', '9', '4', '1', '5', '2', '7', '6']
    base_unicode_list = ['x', 'uniF561', 'uniE6E1', 'uniF125', 'uniF83F', 'uniE9E2', 'uniEEA6', 'uniEEC2', 'uniED38', 'uniE538', 'uniF8E7']
    base_glyphs = base_font['glyf']
    maoyan_glyphs = maoyan_font['glyf']

    # Skip glyph 0 and keep as many glyphs as the reference list has, so the
    # two returned lists always have the same length. The slice is a copy,
    # so the font's own glyph order is not mutated further down.
    glyph_names = maoyan_font.getGlyphOrder()[1:len(base_unicode_list) + 1]

    maoyan_num_list = []
    for glyph_name in glyph_names:
        glyph = maoyan_glyphs[glyph_name]
        for base_name, real_char in zip(base_unicode_list, base_num_list):
            if glyph == base_glyphs[base_name]:
                maoyan_num_list.append(real_char)
                break
        else:
            # Skipping silently would shift every later character by one.
            raise ValueError(
                "glyph %r has no match in base.woff" % (glyph_name,))

    # The first kept glyph is presumably named 'x' (as in the reference
    # list) rather than 'uniXXXX'; give it the uniform name so its hex
    # suffix parses like the others.
    if glyph_names:
        glyph_names[0] = 'uni0078'
    # Glyph names are 'uni' + hex code point. chr(int(...)) replaces the
    # former eval() of a string assembled from downloaded data.
    utf8last = [chr(int(name[3:], 16)) for name in glyph_names]
    return (maoyan_num_list, utf8last)

/ 03 / 資料獲取

01 構造請求頭

# Base request headers, one "Name:value" pair per line. The scraping
# functions append an extra header line to this text and pass the result
# through str_to_dict() to obtain the dict handed to requests.
head = """
Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
Accept-Encoding:gzip, deflate, br
Accept-Language:zh-CN,zh;q=0.8
Cache-Control:max-age=0
Connection:keep-alive
Host:maoyan.com
Upgrade-Insecure-Requests:1
Content-Type:application/x-www-form-urlencoded; charset=UTF-8
User-Agent:Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36
"""

def str_to_dict(header):
    """
    Parse a block of "Name:value" lines into a header dictionary.

    Each line is stripped; blank lines are ignored. A line is split at its
    first colon only, so values may themselves contain colons. The value is
    stripped, the name is kept as it appears before the colon. When a name
    occurs more than once, the last occurrence wins.

    :param header: multi-line string, one header per line.
    :return: dict mapping header names to values.
    :raises ValueError: if a non-blank line contains no colon.
    """
    stripped_lines = (raw_line.strip() for raw_line in header.split('\n'))
    name_value_pairs = (line.split(':', 1) for line in stripped_lines if line)
    return {name: value.strip() for name, value in name_value_pairs}

因為索引頁和詳情頁請求頭不一樣，這裡為了簡便，構造了一個函式。

02 獲取電影詳情頁連結

def get_url():
    """
    Walk the index pages, and scrape and store every film that has a rating.

    For each index page (offsets 0, 30, ..., 270) the film title blocks and
    rating blocks are read in parallel. Every rated film's detail page is
    fetched through get_message() and each resulting record is printed and
    written with to_mysql(). The first unrated film ends the processing of
    the current page. The function sleeps 10 seconds before each index
    request and before each film to throttle the crawl.

    :return: None
    """
    # NOTE(review): the URL literals in this function were lost when the
    # article was extracted (the source read `url = ' + str(i)`), leaving
    # unterminated strings. The host comes from the "Host:maoyan.com" request
    # header; the index path and query string are a reconstruction --
    # TODO confirm the filter parameters (year / sort order) before use.
    site = 'https://maoyan.com'
    index_url = site + '/films?showType=3&yearId=13&sortId=3&offset='
    # The article text carries the Traditional form of the "no rating yet"
    # label; the live page presumably uses the Simplified form, so both are
    # accepted -- TODO confirm.
    unrated_labels = ('暫無評分', '暂无评分')

    for offset in range(0, 300, 30):
        time.sleep(10)
        url = index_url + str(offset)
        # NOTE(review): the Referer value was also lost in extraction.
        host = """Referer:
        """
        headers = str_to_dict(head + host)
        response = requests.get(url=url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        titles = soup.find_all('div', {'class': 'channel-detail movie-item-title'})
        ratings = soup.find_all('div', {'class': 'channel-detail channel-detail-orange'})
        # zip() pairs each title with its rating and stops at the shorter
        # list instead of indexing past the end of the ratings.
        for title, rating in zip(titles, ratings):
            time.sleep(10)
            if rating.get_text() in unrated_labels:
                print('The Work Is Done')
                break
            # The href in the index page is site-relative.
            detail_url = site + title.select('a')[0]['href']
            for message in get_message(detail_url):
                print(message)
                to_mysql(message)
            print(detail_url)
            print('---------------^^^Film_Message^^^-----------------')

03 獲取電影詳情頁資訊

# Both spellings of the "ten thousand" unit. The article text carries the
# Traditional form; the scraped page presumably uses the Simplified one --
# TODO confirm against a live page.
_WAN_CHARS = ('萬', '万')


def _to_count(text):
    """Convert a decoded count such as '12.3萬' or '4567' to an int."""
    for wan in _WAN_CHARS:
        if wan in text:
            return int(float(text.replace(wan, '')) * 10000)
    return int(float(text))


def get_message(url):
    """
    Fetch one film detail page and yield its data as a single dict.

    The obfuscated numbers (score, number of raters, box office) are decoded
    with the character mapping returned by get_numbers().

    :param url: URL of the film detail page.
    :return: generator yielding exactly one dict with the keys ``name``,
        ``type``, ``country``, ``length``, ``released``, ``score``,
        ``people`` and ``box_office``. ``box_office`` is 0 when the page
        shows no box-office unit.
    """
    time.sleep(10)
    # NOTE(review): the header value was lost when the article was extracted,
    # and "refer" is not the standard "Referer" name used by get_url().
    host = """refer: 
    """
    headers = str_to_dict(head + host)
    response = requests.get(url=url, headers=headers)
    page = response.text
    # Crack the font obfuscation: real_chars[k] is the character drawn by
    # the obfuscated character glyph_chars[k].
    (real_chars, glyph_chars) = get_numbers(page)

    soup = BeautifulSoup(page, "html.parser")
    obfuscated_spans = soup.find_all('span', {'class': 'stonefont'})
    unit = soup.find_all('span', {'class': 'unit'})
    ell = soup.find_all('li', {'class': 'ellipsis'})
    name = soup.find_all('h3', {'class': 'name'})

    data = {}
    data["name"] = name[0].get_text()
    data["type"] = ell[0].get_text()
    country_and_length = ell[1].get_text().split('/')
    data["country"] = country_and_length[0].strip().replace('\n', '')
    data["length"] = country_and_length[1].strip().replace('\n', '')
    data["released"] = ell[2].get_text()[:10]

    # Some films have no box office; the page then has no "unit" span.
    has_box_office = bool(unit)
    unit_text = unit[0].get_text() if has_box_office else ''
    for i, span in enumerate(obfuscated_spans):
        text = span.get_text()
        # Replace every obfuscated character with the character it draws.
        for glyph_char, real_char in zip(glyph_chars, real_chars):
            text = text.replace(glyph_char, real_char)
        if i == 0:
            data["score"] = text + '分'
        elif i == 1 or not has_box_office:
            data["people"] = _to_count(text)
        else:
            # The box office is shown in units of ten thousand or, failing
            # that, hundred million.
            multiplier = 10000 if unit_text in _WAN_CHARS else 100000000
            data["box_office"] = int(float(text) * multiplier)
    if not has_box_office:
        data["box_office"] = 0
    yield data

/ 04 / 資料儲存

01 建立資料庫及表格

# NOTE(review): the credentials are hard-coded; load them from configuration
# or the environment before using this outside a demo.

# Step 1: create the database. IF NOT EXISTS keeps the script re-runnable,
# in line with the CREATE TABLE statement below.
db = pymysql.connect(host='127.0.0.1', user='root', password='774110919', port=3306)
try:
    cursor = db.cursor()
    cursor.execute("CREATE DATABASE IF NOT EXISTS maoyan DEFAULT CHARACTER SET utf8mb4")
finally:
    # Close the connection even when the statement fails.
    db.close()

# Step 2: create the films table inside that database.
db = pymysql.connect(host='127.0.0.1', user='root', password='774110919', port=3306, db='maoyan')
try:
    cursor = db.cursor()
    sql = 'CREATE TABLE IF NOT EXISTS films (name VARCHAR(255) NOT NULL, type VARCHAR(255) NOT NULL, country VARCHAR(255) NOT NULL, length VARCHAR(255) NOT NULL, released VARCHAR(255) NOT NULL, score VARCHAR(255) NOT NULL, people INT NOT NULL, box_office BIGINT NOT NULL, PRIMARY KEY (name))'
    cursor.execute(sql)
finally:
    db.close()

其中票房收入資料型別為BIGINT(19位數)，有號(signed)最大值為9223372036854775807；18446744073709551615則是BIGINT UNSIGNED的最大值。

INT(10位數)，最大為2147483647，達不到36億(3600000000)。

02 資料儲存

def to_mysql(data):
    """
    Insert one film record into the ``films`` table of the maoyan database.

    The column list is taken from the dict keys and the values are passed
    as query parameters. The insert is committed when execute() reports
    affected rows; on any error 'Failed' is printed and the transaction is
    rolled back. The connection is always closed.

    :param data: dict mapping column names to values. The keys are
        interpolated into the SQL text, so they must be trusted column names.
    :return: None
    """
    table = 'films'
    keys = ', '.join(data.keys())
    values = ', '.join(['%s'] * len(data))
    sql = 'INSERT INTO {table}({keys}) VALUES ({values})'.format(table=table, keys=keys, values=values)
    db = pymysql.connect(host='localhost', user='root', password='774110919', port=3306, db='maoyan')
    try:
        cursor = db.cursor()
        if cursor.execute(sql, tuple(data.values())):
            print("Successful")
            db.commit()
    except Exception:
        # Best-effort insert (e.g. duplicate primary key). Unlike a bare
        # except, this lets KeyboardInterrupt/SystemExit propagate.
        print('Failed')
        db.rollback()
    finally:
        # Runs even if rollback() itself raises.
        db.close()