The main file is as follows:
#coding=utf-8
import requests
import urllib
import urllib2
import cookielib
import WeiboEncode
import WeiboSearch
import time
import re
import random
import httplib

class WeiboLogin:
    def __init__(self, user, pwd, enableProxy=False):
        "Initialize WeiboLogin. enableProxy says whether to go through a proxy server; off by default."
        print "Initializing WeiboLogin..."
        self.userName = user
        self.passWord = pwd
        self.enableProxy = enableProxy
        self.serverUrl = "http://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&su=&rsakt=mod&client=ssologin.js(v1.4.11)&_=1379834957683"
        self.loginUrl = "http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.11)"
        # The User-Agent header identifies the client browser: hardware platform,
        # system software, application software and personal preferences.
        self.postHeader = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0'}

    def Login(self):
        "Log in to Weibo."
        self.EnableCookie(self.enableProxy)  # set up cookies and, if wanted, the proxy
        serverTime, nonce, pubkey, rsakv = self.GetServerTime()  # step 1 of the login
        postData = WeiboEncode.PostEncode(self.userName, self.passWord, serverTime, nonce, pubkey, rsakv)  # encode user name and password
        print "Post data length: ", len(postData)
        req = urllib2.Request(self.loginUrl, postData, self.postHeader)  # build the request
        print "Posting request..."
        result = urllib2.urlopen(req)  # send the request
        text = result.read()
        try:
            loginUrl = WeiboSearch.sRedirectData(text)  # parse the redirect target (the page jumped to after login)
            urllib2.urlopen(loginUrl)
        except:
            print 'Login error!'
            return False
        print 'Login success!'
        return True

    def EnableCookie(self, enableProxy):
        "Enable cookie & proxy (if needed)."
        cookiejar = cookielib.LWPCookieJar()  # create the cookie jar
        cookie_support = urllib2.HTTPCookieProcessor(cookiejar)
        # HTTPCookieProcessor instances have one attribute:
        # HTTPCookieProcessor.cookiejar (the cookielib.CookieJar in which cookies are stored).
        if enableProxy:
            proxy_support = urllib2.ProxyHandler({'http': '59.59.100.123:8118'})  # go through a proxy
            opener = urllib2.build_opener(proxy_support, cookie_support, urllib2.HTTPHandler)
            # build_opener returns an OpenerDirector instance; OpenerDirector
            # opens URLs via BaseHandlers chained together.
            print "Proxy enabled"
        else:
            opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
        urllib2.install_opener(opener)  # install the cookie-aware opener globally

    def GetServerTime(self):
        "Get server time and nonce, which are used to encode the password."
        # As in digest authentication, the server hands the client a random number
        # (the "nonce"); the client then runs the user name, password, nonce and
        # other fields through a one-way encryption step to produce the message it posts.
        print "Getting server time and nonce..."
        serverData = urllib2.urlopen(self.serverUrl).read()  # fetch the prelogin response
        print serverData
        try:
            serverTime, nonce, pubkey, rsakv = WeiboSearch.sServerData(serverData)  # parse out serverTime, nonce etc.
            return serverTime, nonce, pubkey, rsakv
        except:
            print 'Get server time & nonce error!'
            return None

def fetch_weibo(id, filename):
    "Fetch a user's Weibo posts without the API; only the first few are returned. Arguments: user ID, output file name."
    target = open(filename, 'a')
    myurl = 'http://weibo.com/u/' + id
    line = urllib2.urlopen(myurl).read()
    target.write(line)
    if re.search(r'"WB_detail', line):
        print "success"
    p = re.compile(r'"WB_detail"')
    linelist = p.split(line)
    for fraction in linelist:
        matchObj = re.search(r'nick-name=".+?">\n +(.+?)<', fraction)
        if matchObj:
            target.write(matchObj.group(1))
            target.write(" ")

def fetchqueryresult():
    "Fetch the result pages of a Weibo 'find people' query."
    myurl = "http://s.weibo.com/user/&auth=ord&age=22y&gender=women&region=custom:33:1&page="  # URL of the 'find people' page
    target = open("filename", 'a')  # output file name
    for i in range(37, 51):  # start and end page numbers
        line = urllib2.urlopen(myurl + str(i)).read()  # append the page number to the query URL
        while re.search(r'ids=(\d+?)\\', line):
            matchObj = re.search(r'ids=(\d+?)\\', line)
            print matchObj.group(1)
            target.write(matchObj.group(1))
            target.write(" ")
            p = re.compile(r'' + matchObj.group(1))
            linelist = p.split(line)
            line = linelist[len(linelist) - 1]  # keep scanning the text after the last occurrence
        print i
        time.sleep(2 + random.random())

def getjson():
    "Call the Weibo API and fetch the logged-in user's own Weibo posts."
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0'}  # request headers
    url = "https://api.weibo.com/2/statuses/user_timeline.json"
    your_param = {'source': '1675437817'}  # the request parameters
    result = requests.get(url, params=your_param, headers=headers)
    # requests appends the parameters to the URL; e.g. for http://s.weibo.com/weibo/s
    # the effect is http://s.weibo.com/weibo/s?Refer=sina_index
    result_final = result.text  # the body returned for that URL + parameters
    print result.text

if __name__ == '__main__':
    # When the Python interpreter runs this module (the source file) as the main
    # program, it sets the special __name__ variable to "__main__". When the file
    # is imported from another module, __name__ is set to the module's name instead.
    weiboLogin = WeiboLogin('tanglie23@163.com', 'XXXXXXXX')  # e-mail address (account) and password
    if weiboLogin.Login():
        print "Login successful!"
        myurl = "http://api.weibo.com/2/statuses/timeline_batch.json?source=1675437817&uids=5029941840"
        htmlContent = urllib2.urlopen(myurl).read()
        print htmlContent
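For reference, here is a minimal standalone sketch of the prelogin handshake that GetServerTime performs. It hits the same prelogin.php endpoint and strips the sinaSSOController.preloginCallBack(...) JSONP wrapper; the assumption is that the endpoint still answers in that format:

#coding=utf-8
# Minimal sketch of the prelogin step, assuming the JSONP response format above.
import re
import json
import urllib2

PRELOGIN_URL = ("http://login.sina.com.cn/sso/prelogin.php?"
                "entry=weibo&callback=sinaSSOController.preloginCallBack"
                "&su=&rsakt=mod&client=ssologin.js(v1.4.11)")

raw = urllib2.urlopen(PRELOGIN_URL).read()
payload = json.loads(re.search(r'\((.*)\)', raw).group(1))  # strip the JSONP wrapper
print "servertime:", payload['servertime']
print "nonce     :", payload['nonce']
print "rsakv     :", payload['rsakv']
print "pubkey    :", payload['pubkey'][:16], "..."  # hex RSA modulus, truncated for display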
It relies on two other modules, WeiboSearch.py and WeiboEncode.py.
The code of WeiboEncode.py is as follows:
#coding=utf-8
import urllib
import base64
import rsa
import binascii

def PostEncode(userName, passWord, serverTime, nonce, pubkey, rsakv):
    "Used to generate POST data."
    encodedUserName = GetUserName(userName)  # the user name is base64-encoded
    encodedPassWord = get_pwd(passWord, serverTime, nonce, pubkey)  # the password is currently RSA-encrypted
    postPara = {
        'entry': 'weibo',
        'gateway': '1',
        'from': '',
        'savestate': '7',
        'userticket': '1',
        'ssosimplelogin': '1',
        'vsnf': '1',
        'vsnval': '',
        'su': encodedUserName,
        'service': 'miniblog',
        'servertime': serverTime,
        'nonce': nonce,
        'pwencode': 'rsa2',
        'sp': encodedPassWord,
        'encoding': 'UTF-8',
        'prelt': '115',
        'rsakv': rsakv,
        'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
        'returntype': 'META'
    }
    postData = urllib.urlencode(postPara)  # URL-encode the form body
    return postData

def GetUserName(userName):
    "Used to encode the user name."
    userNameTemp = urllib.quote(userName)
    userNameEncoded = base64.encodestring(userNameTemp)[:-1]
    return userNameEncoded

def get_pwd(password, servertime, nonce, pubkey):
    rsaPublickey = int(pubkey, 16)
    key = rsa.PublicKey(rsaPublickey, 65537)  # build the public key
    message = str(servertime) + '\t' + str(nonce) + '\n' + str(password)  # concatenate the plaintext as the login JS does
    passwd = rsa.encrypt(message, key)  # encrypt
    passwd = binascii.b2a_hex(passwd)  # convert the ciphertext to hex
    return passwd
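As a quick sanity check, WeiboEncode can be exercised on its own. The prelogin values below are made-up placeholders, and a throwaway RSA key pair stands in for Sina's real pubkey, so this only demonstrates the encoding path:

#coding=utf-8
# Hedged sanity check for WeiboEncode with fabricated prelogin values.
import rsa
import WeiboEncode

# Generate a throwaway key pair so get_pwd has a valid hex modulus to work with;
# in real use, pubkey and rsakv come from the prelogin response.
(pub, _priv) = rsa.newkeys(1024)
fakePubkey = '%x' % pub.n

postData = WeiboEncode.PostEncode('someone@example.com', 'secret',
                                  '1379834957', 'ABC123', fakePubkey, '1330428213')
print postData[:80], '...'  # a URL-encoded form body: su=..., sp=..., servertime=... etc.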
And here is WeiboSearch.py:
#coding=utf-8
import re
import json

def sServerData(serverData):
    "Search the server time & nonce from server data."
    p = re.compile(r'\((.*)\)')  # re.compile turns the regular expression into a regex object
    jsonData = p.search(serverData).group(1)  # grab the JSON inside the preloginCallBack(...) wrapper
    data = json.loads(jsonData)  # decode the encoded JSON back to the original data with json.loads()
    serverTime = str(data['servertime'])
    nonce = data['nonce']
    pubkey = data['pubkey']
    rsakv = data['rsakv']
    print "Server time is:", serverTime
    print "Nonce is:", nonce
    return serverTime, nonce, pubkey, rsakv

def sRedirectData(text):
    p = re.compile(r'location\.replace\([\'"](.*?)[\'"]\)')
    loginUrl = p.search(text).group(1)
    print 'loginUrl:', loginUrl
    return loginUrl
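sRedirectData simply pulls the target URL out of the location.replace(...) call in the page the login POST returns. A fabricated reply illustrates it (the ticket value is a placeholder):

#coding=utf-8
# Illustrating sRedirectData on a fabricated server reply.
import WeiboSearch

sampleReply = ('<script language="javascript">'
               'location.replace("http://weibo.com/ajaxlogin.php?retcode=0&ticket=ST-FAKE");'
               '</script>')
print WeiboSearch.sRedirectData(sampleReply)  # -> http://weibo.com/ajaxlogin.php?retcode=0&ticket=ST-FAKE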
At present the crawler can log in automatically and call Sina Weibo's ordinary APIs. Fetching other users' Weibo posts in bulk, however, requires advanced API authorization, which is still being applied for.
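Once that authorization comes through, the timeline_batch endpoint would be called with an OAuth2 access token rather than the bare source parameter. A minimal sketch, assuming a token obtained from Weibo's OAuth2 flow (ACCESS_TOKEN below is a placeholder):

#coding=utf-8
# Hedged sketch: calling timeline_batch with an OAuth2 access token.
import requests

ACCESS_TOKEN = 'YOUR_ACCESS_TOKEN'  # placeholder: obtained via Weibo's OAuth2 flow
result = requests.get('https://api.weibo.com/2/statuses/timeline_batch.json',
                      params={'access_token': ACCESS_TOKEN, 'uids': '5029941840'})
print result.text  # on success, JSON with a "statuses" list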