Python Web Scraping: the Basic Modules You Really Need to Know!!
Preface
Python web scraping (web spider): crawling a website to fetch its pages, then parsing them to extract the data you want.
The basic modules used here are urllib, urllib2, re, and a few others.
(I) Basic usage, with examples
(1) Making a basic GET request to fetch a page's HTML
# coding=utf-8
import urllib
import urllib2

url = 'http://www.baidu.com/'
# Build the request
request = urllib2.Request(url)
try:
    # Send the request and get the response back
    response = urllib2.urlopen(request)
except urllib2.HTTPError, e:
    if hasattr(e, 'reason'):
        print e.reason
else:
    # Read the response body
    html = response.read()
    # Read the response headers
    headers = response.info()
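If you are on Python 3, note that urllib2 was split into urllib.request and urllib.error. A minimal sketch of the same GET request under Python 3 (same Baidu URL as above):

# Python 3 sketch: urllib2's features live in urllib.request / urllib.error
from urllib import request, error

url = 'http://www.baidu.com/'
req = request.Request(url)
try:
    response = request.urlopen(req)
except error.HTTPError as e:
    # HTTPError always carries a status code; a plain URLError carries e.reason
    print(e.code)
else:
    html = response.read()       # response body, as bytes
    headers = response.info()    # response headers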
(2) Submitting a form (POST request)
# coding=utf-8
import urllib2
import urllib

post_url = ''
post_data = urllib.urlencode({
    'username': 'username',
    'password': 'password',
})
post_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:31.0) Gecko/20100101 Firefox/31.0',
}
request = urllib2.Request(
    url=post_url,
    data=post_data,
    headers=post_headers,
)
response = urllib2.urlopen(request)
html = response.read()
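One pitfall when porting this POST example to Python 3: urlopen there requires data to be bytes, so the urlencoded string has to be encoded first. A rough Python 3 sketch (the URL is a placeholder, as in the snippet above):

# Python 3 sketch: POST data must be bytes, not str
from urllib import request, parse

post_url = 'http://example.com/login'    # placeholder target
post_data = parse.urlencode({
    'username': 'username',
    'password': 'password',
}).encode('utf-8')                       # the .encode() call is the key difference
post_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:31.0) Gecko/20100101 Firefox/31.0',
}
req = request.Request(post_url, data=post_data, headers=post_headers)
html = request.urlopen(req).read()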
(3) Scraping post content from a Baidu Tieba thread
# coding=utf-8
import urllib2
import re

page_num = 1
url = 'http://tieba.baidu.com/p/3238280985?see_lz=1&pn=' + str(page_num)
myPage = urllib2.urlopen(url).read().decode('gbk')
# Match the content block of each post
myRe = re.compile(r'class="d_post_content j_d_post_content ">(.*?)</div>', re.DOTALL)
items = myRe.findall(myPage)

f = open('baidu.txt', 'a+')

import sys
reload(sys)
sys.setdefaultencoding('utf-8')

i = 0
for item in items:
    i += 1
    print i
    text = item.replace('<br>', '')
    # Strip newlines and spaces, then terminate each post with a newline
    text = text.replace('\n', '').replace(' ', '') + '\n'
    print text
    f.write(text)
f.close()
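The reload(sys) / sys.setdefaultencoding('utf-8') trick above works, but it mutates interpreter-wide state and is generally discouraged. A sketch of a cleaner Python 2 alternative is to open the output file through codecs, which encodes transparently on write:

# Python 2 sketch: let the file object handle the encoding instead of
# patching sys.setdefaultencoding
import codecs

f = codecs.open('baidu.txt', 'a+', encoding='utf-8')
f.write(text)   # text is the unicode post content from the loop above
f.close()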
(4) Logging in to a 163 (NetEase) mailbox and downloading mail contents
# coding:utf-8
'''
Simulate logging in to a 163 mailbox and download the mail contents
'''
import urllib
import urllib2
import cookielib
import re
import time
import json
class Email163:
    header = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    user = ''
    cookie = None
    sid = None
    mailBaseUrl = 'http://twebmail.mail.163.com'

    def __init__(self):
        self.cookie = cookielib.CookieJar()
        cookiePro = urllib2.HTTPCookieProcessor(self.cookie)
        urllib2.install_opener(urllib2.build_opener(cookiePro))
    def login(self, user, pwd):
        '''Log in'''
        postdata = urllib.urlencode({
            'username': user,
            'password': pwd,
            'type': 1
        })
        # Note: the login URL differs between webmail versions
        req = urllib2.Request(
            url='https://ssl.mail.163.com/entry/coremail/fcg/ntesdoor2?funcid=loginone&language=-1&passtype=1&iframe=1&product=mail163&from=web&df=email163&race=-2_45_-2_hz&module=&uid=' + user + '&style=10&net=t&skinid=null',
            data=postdata,
            headers=self.header,
        )
        res = str(urllib2.urlopen(req).read())
        # print res
        patt = re.compile('sid=([^"]+)', re.I)
        patt = patt.search(res)
        uname = user.split('@')[0]
        self.user = user
        if patt:
            self.sid = patt.group(1).strip()
            # print self.sid
            print '%s Login Successful.....' % (uname)
        else:
            print '%s Login failed....' % (uname)
    def getInBox(self):
        '''Fetch the list of mails in the inbox'''
        print '\nGet mail lists.....\n'
        sid = self.sid
        url = self.mailBaseUrl + '/jy3/list/list.do?sid=' + sid + '&fid=1&fr=folder'
        res = urllib2.urlopen(url).read()
        # Parse the mail list out of the page
        mailList = []
        patt = re.compile('<div\s+class="tdLike Ibx_Td_From"[^>]+>.*?href="([^"]+)"[^>]+>(.*?)</a>.*?<div\s+class="tdLike Ibx_Td_Subject"[^>]+>.*?href="[^>]+>(.*?)</a>', re.I | re.S)
        patt = patt.findall(res)
        if not patt:
            return mailList
        for i in patt:
            line = {
                'from': i[1].decode('utf8'),
                'url': self.mailBaseUrl + i[0],
                'subject': i[2].decode('utf8')
            }
            mailList.append(line)
        return mailList
    def getMailMsg(self, url):
        '''Download the mail content'''
        content = ''
        print '\nDownload.....%s\n' % (url)
        res = urllib2.urlopen(url).read()
        patt = re.compile('contentURL:"([^"]+)"', re.I)
        patt = patt.search(res)
        if patt is None:
            return content
        url = '%s%s' % (self.mailBaseUrl, patt.group(1))
        time.sleep(1)
        res = urllib2.urlopen(url).read()
        Djson = json.JSONDecoder(encoding='utf8')
        jsonRes = Djson.decode(res)
        if 'resultVar' in jsonRes:
            content = jsonRes['resultVar']
        time.sleep(3)
        return content
# Demo
# Initialize
mail163 = Email163()
# Log in
mail163.login('lpe234@163.com', '944898186')
time.sleep(2)
# Fetch the inbox listing
elist = mail163.getInBox()
# Fetch each mail's content
for i in elist:
    print 'Subject: %s  From: %s  Content:\n%s' % (
        i['subject'].encode('utf8'),
        i['from'].encode('utf8'),
        mail163.getMailMsg(i['url']).encode('utf8'))
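The CookieJar created in Email163.__init__ keeps the session only in memory, so every run has to log in again. cookielib also provides MozillaCookieJar, which can save cookies to a file and load them back; a minimal sketch (the filename is my own choice):

# Python 2 sketch: persist the session cookies to disk between runs
import cookielib
import urllib2

cj = cookielib.MozillaCookieJar('163_cookies.txt')   # hypothetical filename
urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)))

# ... log in as above, then save the session:
cj.save(ignore_discard=True, ignore_expires=True)

# On a later run, restore it instead of logging in again:
cj.load('163_cookies.txt', ignore_discard=True, ignore_expires=True)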
(5) Cases that require logging in
# 1. Handling cookies
import urllib2, cookielib

cookie_support = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
urllib2.install_opener(opener)
content = urllib2.urlopen('http://XXXX').read()
# 2. Using a proxy together with cookies
# proxy_support is a ProxyHandler; the address below is a placeholder
proxy_support = urllib2.ProxyHandler({'http': 'http://XX.XX.XX.XX:XXXX'})
opener = urllib2.build_opener(proxy_support, cookie_support, urllib2.HTTPHandler)
# 3. Handling forms
import urllib
postdata = urllib.urlencode({
    'username': 'XXXXX',
    'password': 'XXXXX',
    'continueURI': 'http://www.verycd.com/',
    'fk': fk,  # fk is a dynamic token that has to be scraped from the login page first
    'login_submit': '登入'
})
req = urllib2.Request(
    url='http://secure.verycd.com/signin/*/http://www.verycd.com/',
    data=postdata
)
result = urllib2.urlopen(req).read()
# 4. Masquerading as a browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
}
req = urllib2.Request(
    url='http://secure.verycd.com/signin/*/http://www.verycd.com/',
    data=postdata,
    headers=headers
)
# 5. Beating "anti-leech" checks by spoofing the Referer
headers = {
    'Referer': 'http://www.cnbeta.com/articles'
}
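These five techniques compose: build_opener accepts any number of handlers, so a proxy, a cookie jar, and default headers can all be wired into a single opener. A sketch combining them (the proxy address and target URL are placeholders):

# Python 2 sketch: one opener with proxy, cookies, and browser-like headers
import urllib2, cookielib

cookie_support = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
proxy_support = urllib2.ProxyHandler({'http': 'http://XX.XX.XX.XX:XXXX'})   # placeholder
opener = urllib2.build_opener(proxy_support, cookie_support)
# addheaders sets default headers on every request made through this opener
opener.addheaders = [
    ('User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'),
    ('Referer', 'http://www.cnbeta.com/articles'),
]
content = opener.open('http://XXXX').read()   # placeholder URL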
(6) Multithreading
from threading import Thread
from Queue import Queue
from time import sleep

# q is the task queue
# NUM is the total number of concurrent threads
# JOBS is the number of tasks
q = Queue()
NUM = 2
JOBS = 10

# The handler that processes a single task
def do_something_using(arguments):
    print arguments

# The worker: keeps pulling tasks off the queue and processing them
def working():
    while True:
        arguments = q.get()
        do_something_using(arguments)
        sleep(1)
        q.task_done()

# Spawn NUM worker threads waiting on the queue
for i in range(NUM):
    t = Thread(target=working)
    t.setDaemon(True)
    t.start()

# Enqueue the JOBS tasks
for i in range(JOBS):
    q.put(i)

# Wait for all tasks to finish
q.join()
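To tie this back to scraping: the same queue/worker pattern applies directly when each task is a URL and the handler downloads it with urllib2. A sketch under that assumption (the job list reuses the Tieba thread from example (3), with made-up page numbers):

# Python 2 sketch: the queue/worker pattern applied to page downloads
import urllib2
from threading import Thread
from Queue import Queue

q = Queue()
NUM = 2

def fetch(url):
    # Download one page; catch errors so a bad URL cannot kill its worker
    try:
        print url, len(urllib2.urlopen(url).read())
    except urllib2.URLError, e:
        print url, e

def working():
    while True:
        url = q.get()
        fetch(url)
        q.task_done()

for i in range(NUM):
    t = Thread(target=working)
    t.setDaemon(True)
    t.start()

# Hypothetical jobs: pages 1..5 of the thread scraped in example (3)
for n in range(1, 6):
    q.put('http://tieba.baidu.com/p/3238280985?see_lz=1&pn=%d' % n)

q.join()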