Install the Scrapy crawler framework
How to install Python and the Scrapy framework is not covered here; please search online for instructions (once Python is set up, typically running pip install Scrapy is enough).
Initialization
After installing Scrapy, run scrapy startproject myspider
This creates a myspider folder with the following directory structure:
- scrapy.cfg
- myspider
  - items.py
  - pipelines.py
  - settings.py
  - __init__.py
  - spiders
    - __init__.py
Write the spider
Create a new file users.py under the spiders directory (the code below is written for Python 2):
```python
# -*- coding: utf-8 -*-
import scrapy
import os
import time

from myspider.items import UserItem
from myspider.myconfig import UsersConfig  # spider configuration


class UsersSpider(scrapy.Spider):
    name = 'users'
    domain = 'https://www.zhihu.com'
    login_url = 'https://www.zhihu.com/login/email'
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Connection": "keep-alive",
        "Host": "www.zhihu.com",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.109 Safari/537.36"
    }

    def __init__(self, url = None):
        self.user_url = url

    def start_requests(self):
        # open the homepage first to obtain the _xsrf token and a cookie jar
        yield scrapy.Request(
            url = self.domain,
            headers = self.headers,
            meta = {
                'proxy': UsersConfig['proxy'],
                'cookiejar': 1
            },
            callback = self.request_captcha
        )

    def request_captcha(self, response):
        # extract the _xsrf value
        _xsrf = response.css('input[name="_xsrf"]::attr(value)').extract()[0]
        # build the captcha URL
        captcha_url = 'http://www.zhihu.com/captcha.gif?r=' + str(time.time() * 1000)
        # download the captcha
        yield scrapy.Request(
            url = captcha_url,
            headers = self.headers,
            meta = {
                'proxy': UsersConfig['proxy'],
                'cookiejar': response.meta['cookiejar'],
                '_xsrf': _xsrf
            },
            callback = self.download_captcha
        )

    def download_captcha(self, response):
        # save the captcha image
        with open('captcha.gif', 'wb') as fp:
            fp.write(response.body)
        # open the captcha image with the default viewer
        # ('start' is Windows-only; use 'open' on macOS or 'xdg-open' on Linux)
        os.system('start captcha.gif')
        # ask for the captcha on the terminal
        print 'Please enter captcha: '
        captcha = raw_input()

        # submit the login form together with the captcha
        yield scrapy.FormRequest(
            url = self.login_url,
            headers = self.headers,
            formdata = {
                'email': UsersConfig['email'],
                'password': UsersConfig['password'],
                '_xsrf': response.meta['_xsrf'],
                'remember_me': 'true',
                'captcha': captcha
            },
            meta = {
                'proxy': UsersConfig['proxy'],
                'cookiejar': response.meta['cookiejar']
            },
            callback = self.request_zhihu
        )

    def request_zhihu(self, response):
        # seed user: scrape the profile, then walk the followees and followers
        yield scrapy.Request(
            url = self.user_url + '/about',
            headers = self.headers,
            meta = {
                'proxy': UsersConfig['proxy'],
                'cookiejar': response.meta['cookiejar'],
                'from': {
                    'sign': 'else',
                    'data': {}
                }
            },
            callback = self.user_item,
            dont_filter = True
        )
        yield scrapy.Request(
            url = self.user_url + '/followees',
            headers = self.headers,
            meta = {
                'proxy': UsersConfig['proxy'],
                'cookiejar': response.meta['cookiejar'],
                'from': {
                    'sign': 'else',
                    'data': {}
                }
            },
            callback = self.user_start,
            dont_filter = True
        )
        yield scrapy.Request(
            url = self.user_url + '/followers',
            headers = self.headers,
            meta = {
                'proxy': UsersConfig['proxy'],
                'cookiejar': response.meta['cookiejar'],
                'from': {
                    'sign': 'else',
                    'data': {}
                }
            },
            callback = self.user_start,
            dont_filter = True
        )

    def user_start(self, response):
        sel_root = response.xpath('//h2[@class="zm-list-content-title"]')
        # skip if the followee/follower list is empty
        if len(sel_root):
            for sel in sel_root:
                people_url = sel.xpath('a/@href').extract()[0]

                yield scrapy.Request(
                    url = people_url + '/about',
                    headers = self.headers,
                    meta = {
                        'proxy': UsersConfig['proxy'],
                        'cookiejar': response.meta['cookiejar'],
                        'from': {
                            'sign': 'else',
                            'data': {}
                        }
                    },
                    callback = self.user_item,
                    dont_filter = True
                )
                yield scrapy.Request(
                    url = people_url + '/followees',
                    headers = self.headers,
                    meta = {
                        'proxy': UsersConfig['proxy'],
                        'cookiejar': response.meta['cookiejar'],
                        'from': {
                            'sign': 'else',
                            'data': {}
                        }
                    },
                    callback = self.user_start,
                    dont_filter = True
                )
                yield scrapy.Request(
                    url = people_url + '/followers',
                    headers = self.headers,
                    meta = {
                        'proxy': UsersConfig['proxy'],
                        'cookiejar': response.meta['cookiejar'],
                        'from': {
                            'sign': 'else',
                            'data': {}
                        }
                    },
                    callback = self.user_start,
                    dont_filter = True
                )

    def user_item(self, response):
        # helper: first element of a list, or '' if the list is empty
        def value(list):
            return list[0] if len(list) else ''

        sel = response.xpath('//div[@class="zm-profile-header ProfileCard"]')

        item = UserItem()
        item['url'] = response.url[:-6]  # strip the trailing '/about'
        item['name'] = sel.xpath('//a[@class="name"]/text()').extract()[0].encode('utf-8')
        item['bio'] = value(sel.xpath('//span[@class="bio"]/@title').extract()).encode('utf-8')
        item['location'] = value(sel.xpath('//span[contains(@class, "location")]/@title').extract()).encode('utf-8')
        item['business'] = value(sel.xpath('//span[contains(@class, "business")]/@title').extract()).encode('utf-8')
        item['gender'] = 0 if sel.xpath('//i[contains(@class, "icon-profile-female")]') else 1  # 0 = female, 1 = male
        item['avatar'] = value(sel.xpath('//img[@class="Avatar Avatar--l"]/@src').extract())
        item['education'] = value(sel.xpath('//span[contains(@class, "education")]/@title').extract()).encode('utf-8')
        item['major'] = value(sel.xpath('//span[contains(@class, "education-extra")]/@title').extract()).encode('utf-8')
        item['employment'] = value(sel.xpath('//span[contains(@class, "employment")]/@title').extract()).encode('utf-8')
        item['position'] = value(sel.xpath('//span[contains(@class, "position")]/@title').extract()).encode('utf-8')
        item['content'] = value(sel.xpath('//span[@class="content"]/text()').extract()).strip().encode('utf-8')
        item['ask'] = int(sel.xpath('//div[contains(@class, "profile-navbar")]/a[2]/span[@class="num"]/text()').extract()[0])
        item['answer'] = int(sel.xpath('//div[contains(@class, "profile-navbar")]/a[3]/span[@class="num"]/text()').extract()[0])
        item['agree'] = int(sel.xpath('//span[@class="zm-profile-header-user-agree"]/strong/text()').extract()[0])
        item['thanks'] = int(sel.xpath('//span[@class="zm-profile-header-user-thanks"]/strong/text()').extract()[0])

        yield item
```
Add the spider configuration file
Create a new file myconfig.py under the myspider directory, add the following content, and fill in your own configuration values:
```python
# -*- coding: utf-8 -*-

UsersConfig = {
    # proxy
    'proxy': '',

    # Zhihu username and password
    'email': 'your email',
    'password': 'your password',
}

DbConfig = {
    # db config
    'user': 'db user',
    'passwd': 'db password',
    'db': 'db name',
    'host': 'db host',
}
```
Modify items.py
```python
# -*- coding: utf-8 -*-
import scrapy


class UserItem(scrapy.Item):
    # define the fields for your item here like:
    url = scrapy.Field()
    name = scrapy.Field()
    bio = scrapy.Field()
    location = scrapy.Field()
    business = scrapy.Field()
    gender = scrapy.Field()
    avatar = scrapy.Field()
    education = scrapy.Field()
    major = scrapy.Field()
    employment = scrapy.Field()
    position = scrapy.Field()
    content = scrapy.Field()
    ask = scrapy.Field()
    answer = scrapy.Field()
    agree = scrapy.Field()
    thanks = scrapy.Field()
```
Store the user data in a MySQL database
Modify pipelines.py
```python
# -*- coding: utf-8 -*-
import MySQLdb
import datetime

from myspider.myconfig import DbConfig


class UserPipeline(object):
    def __init__(self):
        self.conn = MySQLdb.connect(user = DbConfig['user'], passwd = DbConfig['passwd'],
                                    db = DbConfig['db'], host = DbConfig['host'],
                                    charset = 'utf8', use_unicode = True)
        self.cursor = self.conn.cursor()
        # empty the table
        # self.cursor.execute('truncate table weather;')
        # self.conn.commit()

    def process_item(self, item, spider):
        curTime = datetime.datetime.now()
        try:
            self.cursor.execute(
                """INSERT IGNORE INTO users (url, name, bio, location, business, gender, avatar,
                education, major, employment, position, content, ask, answer, agree, thanks, create_at)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
                (
                    item['url'],
                    item['name'],
                    item['bio'],
                    item['location'],
                    item['business'],
                    item['gender'],
                    item['avatar'],
                    item['education'],
                    item['major'],
                    item['employment'],
                    item['position'],
                    item['content'],
                    item['ask'],
                    item['answer'],
                    item['agree'],
                    item['thanks'],
                    curTime
                )
            )
            self.conn.commit()
        except MySQLdb.Error, e:
            print 'Error %d %s' % (e.args[0], e.args[1])

        return item
```
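The pipeline assumes a users table already exists in the configured database. The original write-up does not show its schema, so below is a minimal sketch of a helper script (the file name create_table.py, the column types, and the unique key on url are all assumptions) that creates a table matching the INSERT above. Run it once from the project root before starting the crawl.

```python
# -*- coding: utf-8 -*-
# create_table.py -- hypothetical helper, not part of the original tutorial.
# Creates a `users` table matching the columns used by UserPipeline.
# Column types and the unique key on `url` are assumptions; adjust as needed.
import MySQLdb

from myspider.myconfig import DbConfig

CREATE_USERS_TABLE = """
CREATE TABLE IF NOT EXISTS users (
    id         INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
    url        VARCHAR(255) NOT NULL,
    name       VARCHAR(255) NOT NULL DEFAULT '',
    bio        VARCHAR(255) NOT NULL DEFAULT '',
    location   VARCHAR(255) NOT NULL DEFAULT '',
    business   VARCHAR(255) NOT NULL DEFAULT '',
    gender     TINYINT      NOT NULL DEFAULT 1,
    avatar     VARCHAR(255) NOT NULL DEFAULT '',
    education  VARCHAR(255) NOT NULL DEFAULT '',
    major      VARCHAR(255) NOT NULL DEFAULT '',
    employment VARCHAR(255) NOT NULL DEFAULT '',
    position   VARCHAR(255) NOT NULL DEFAULT '',
    content    TEXT,
    ask        INT NOT NULL DEFAULT 0,
    answer     INT NOT NULL DEFAULT 0,
    agree      INT NOT NULL DEFAULT 0,
    thanks     INT NOT NULL DEFAULT 0,
    create_at  DATETIME,
    UNIQUE KEY uniq_url (url)
) DEFAULT CHARSET = utf8
"""

if __name__ == '__main__':
    # connect with the same credentials the pipeline uses
    conn = MySQLdb.connect(user = DbConfig['user'], passwd = DbConfig['passwd'],
                           db = DbConfig['db'], host = DbConfig['host'], charset = 'utf8')
    cursor = conn.cursor()
    cursor.execute(CREATE_USERS_TABLE)
    conn.commit()
    conn.close()
```

With a unique key on url, the INSERT IGNORE in the pipeline silently skips users that have already been saved.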
Modify settings.py
Find ITEM_PIPELINES and change it to:
```python
ITEM_PIPELINES = {
    'myspider.pipelines.UserPipeline': 300,
}
```
At the end of the file, add the following to set the crawl depth:
```python
DEPTH_LIMIT = 10
```
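Optionally (this is not part of the original write-up), you can throttle the crawl a little so it puts less load on the site; these are standard Scrapy settings and can go in the same settings.py:

```python
# Optional settings (not from the original tutorial): slow the crawl down
# and keep session cookies between requests.
DOWNLOAD_DELAY = 1      # wait 1 second between requests
COOKIES_ENABLED = True  # default is already True; required for the login/cookiejar flow
```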
Crawl Zhihu user data
Make sure MySQL is running, open a terminal in the project root directory, and run scrapy crawl users -a url=https://www.zhihu.com/people/user, where user (the last segment of the URL) is the first user to crawl. Starting from that seed user, the spider keeps expanding through each user's followees and followers.
Next, the captcha image is downloaded; if it does not open automatically, open captcha.gif in the project root yourself, then type the captcha into the terminal.
The data crawl then runs and the results are written to the database.
Source code
The source code can be found here: github