Because I'm building Guandian (觀點), whose "rooms" are much like Zhihu topics, I needed a way to crawl Zhihu's topic tree. After a good bit of fiddling I finally got it working solidly. The code is Python; if you don't know Python you'll have to teach yourself the basics, but if you do, just read the code below — it definitely works.
#coding:utf-8
"""
@author:haoning
@create time:2015.8.5
"""
from __future__ import division  # true division
from Queue import Queue
import json
import os
import re
import platform
import uuid
import urllib
import urllib2
import sys
import time
import MySQLdb as mdb
from bs4 import BeautifulSoup

reload(sys)
sys.setdefaultencoding("utf-8")
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'X-Requested-With': 'XMLHttpRequest',
    'Referer': 'https://www.zhihu.com/topics',
    'Cookie': '__utma=51854390.517069884.1416212035.1416212035.1416212035.1; q_c1=c02bf44d00d240798bfabcfc95baeb56|1455778173000|1416205243000; _za=b1c8ae35-f986-46a2-b24a-cb9359dc6b2a; aliyungf_tc=AQAAAJ1m71jL1woArKqF22VFnL/wRy6C; _xsrf=9d494558f9271340ab24598d85b2a3c8; cap_id="MDNiMjcwM2U0MTRhNDVmYjgxZWVhOWI0NTA2OGU5OTg=|1455864276|2a4ce8247ebd3c0df5393bb5661713ad9eec01dd"; n_c=1; _alicdn_sec=56c6ba4d556557d27a0f8c876f563d12a285f33a'
}
DB_HOST = '127.0.0.1'
DB_USER = 'root'
DB_PASS = 'root'

queue = Queue()  # work queue of (node, name, parent) topics to visit
nodeSet = set()
keywordSet = set()
stop = 0
offset = -20
level = 0
maxLevel = 7
counter = 0
base = ""

conn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'zhihu', charset='utf8')
conn.autocommit(False)
curr = conn.cursor()
def get_html(url):
    try:
        req = urllib2.Request(url)
        response = urllib2.urlopen(req, None, 3)  # a proxy should be added here
        html = response.read()
        return html
    except:
        pass
    return None
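# A minimal sketch of the proxy idea mentioned in the comment above, using the
# standard-library urllib2.ProxyHandler. The proxy address below is a
# placeholder of my own, not something the original script shipped with.
def get_html_via_proxy(url, proxy="127.0.0.1:8087"):
    try:
        opener = urllib2.build_opener(urllib2.ProxyHandler({"http": proxy}))
        return opener.open(url, None, 3).read()  # same 3-second timeout as get_html
    except:
        return None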
def getTopics():
    url = 'https://www.zhihu.com/topics'
    print url
    try:
        req = urllib2.Request(url)
        response = urllib2.urlopen(req)  # a proxy should be added here
        html = response.read().decode('utf-8')
        print html
        soup = BeautifulSoup(html)
        lis = soup.find_all('li', {'class': 'zm-topic-cat-item'})
        for li in lis:
            data_id = li.get('data-id')
            name = li.text
            curr.execute('select id from classify_new where name=%s', (name,))
            y = curr.fetchone()
            if not y:
                curr.execute('INSERT INTO classify_new(data_id,name) VALUES(%s,%s)', (data_id, name))
                conn.commit()
    except Exception as e:
        print "get topic error", e
def get_extension(name):
    where = name.rfind('.')
    if where != -1:
        return name[where:len(name)]
    return None

def which_platform():
    sys_str = platform.system()
    return sys_str

def GetDateString():
    when = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    foldername = str(when)
    return foldername

def makeDateFolder(par, classify):
    try:
        if os.path.isdir(par):
            newFolderName = par + '//' + GetDateString() + '//' + str(classify)
            if which_platform() == "Linux":
                newFolderName = par + '/' + GetDateString() + "/" + str(classify)
            if not os.path.isdir(newFolderName):
                os.makedirs(newFolderName)
            return newFolderName
        else:
            return None
    except Exception as e:
        print "kk", e
    return None
def download_img(url, classify):
    try:
        extention = get_extension(url)
        if extention is None:
            return None
        req = urllib2.Request(url)
        resp = urllib2.urlopen(req, None, 3)
        dataimg = resp.read()
        name = str(uuid.uuid1()).replace("-", "") + "_www.guandn.com" + extention
        top = "E://topic_pic"
        folder = makeDateFolder(top, classify)
        filename = None
        if folder is not None:
            filename = folder + "//" + name
        try:
            if "e82bab09c_m" in str(url):  # special-cased image: tell the caller to store NULL instead
                return True
            if not os.path.exists(filename):
                file_object = open(filename, 'w+b')
                file_object.write(dataimg)
                file_object.close()
                return '/room/default/' + GetDateString() + '/' + str(classify) + "/" + name
            else:
                print "file exist"
                return None
        except IOError as e1:
            print "e1=", e1
    except Exception as e:
        print "eee", e
    return None  # if the image was not downloaded, fall back to the original site's URL
def getChildren(node, name):
    global queue, nodeSet
    try:
        url = "https://www.zhihu.com/topic/" + str(node) + "/hot"
        html = get_html(url)
        if html is None:
            return
        soup = BeautifulSoup(html)
        p_ch = '父話題'
        node_name = soup.find('div', {'id': 'zh-topic-title'}).find('h1').text
        topic_cla = soup.find('div', {'class': 'child-topic'})
        if topic_cla is not None:
            try:
                p_ch = str(topic_cla.text)
                aList = soup.find_all('a', {'class': 'zm-item-tag'})  # collect every child-topic link
                if u'子話題' in p_ch:
                    for a in aList:
                        token = a.get('data-token')
                        a = str(a).replace('\n', '').replace(' ', '').replace('\r', '')
                        start = str(a).find('>')
                        end = str(a).rfind('</a>')
                        new_node = str(str(a)[start + 1:end])
                        curr.execute('select id from rooms where name=%s', (new_node,))  # make sure the name is not already taken
                        y = curr.fetchone()
                        if not y:
                            print "y=", y, "new_node=", new_node, "token=", token
                            queue.put((token, new_node, node_name))
            except Exception as e:
                print "add queue error", e
    except Exception as e:
        print "get html error", e
def getContent(n, name, p, top_id):
    try:
        global counter
        curr.execute('select id from rooms where name=%s', (name,))  # make sure the name is not already taken
        y = curr.fetchone()
        print "exist?? ", y, "n=", n
        if not y:
            url = "https://www.zhihu.com/topic/" + str(n) + "/hot"
            html = get_html(url)
            if html is None:
                return
            soup = BeautifulSoup(html)
            title = soup.find('div', {'id': 'zh-topic-title'}).find('h1').text
            pic_path = soup.find('a', {'id': 'zh-avartar-edit-form'}).find('img').get('src')
            description = soup.find('div', {'class': 'zm-editable-content'})
            if description is not None:
                description = description.text
            if u"未歸類" in title or u"根話題" in title:  # still allow these rows in, but blank the description, to avoid an infinite loop
                description = None
            tag_path = download_img(pic_path, top_id)
            print "tag_path=", tag_path
            if tag_path is not None:  # covers both a stored path and the True sentinel
                if tag_path == True:
                    tag_path = None
                father_id = 2  # default parent: 雜談
                curr.execute('select id from rooms where name=%s', (p,))
                results = curr.fetchall()
                for r in results:
                    father_id = r[0]
                name = title
                curr.execute('select id from rooms where name=%s', (name,))  # make sure the name is not already taken
                y = curr.fetchone()
                print "store see..", y
                if not y:
                    friends_num = 0
                    create_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())  # current time
                    creater_id = None
                    room_avatar = tag_path
                    is_pass = 1
                    has_index = 0
                    reason_id = None
                    # this row qualifies for the database
                    counter = counter + 1
                    curr.execute("INSERT INTO rooms(father_id,name,friends_num,description,create_time,creater_id,room_avatar,is_pass,has_index,reason_id) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", (father_id, name, friends_num, description, create_time, creater_id, room_avatar, is_pass, has_index, reason_id))
                    conn.commit()  # commit right away, otherwise child rows cannot find their parent
            if counter % 200 == 0:
                print "current node", name, "num", counter
    except Exception as e:
        print "get content error", e
def work():
    global queue
    curr.execute('select id,node,parent,name from classify where status=1')
    results = curr.fetchall()
    for r in results:
        top_id = r[0]
        node = r[1]
        parent = r[2]
        name = r[3]
        try:
            queue.put((node, name, parent))  # seed the queue first
            while queue.qsize() > 0:
                n, name, p = queue.get()  # pop the head node
                getContent(n, name, p, top_id)
                getChildren(n, name)  # enqueue the popped node's children
                conn.commit()
        except Exception as e:
            print "what's wrong", e
def new_work():
    global queue
    curr.execute('select id,data_id,name from classify_new_copy where status=1')
    results = curr.fetchall()
    for r in results:
        top_id = r[0]
        data_id = r[1]
        name = r[2]
        try:
            get_topis(data_id, name, top_id)
        except:
            pass
def get_topis(data_id, name, top_id):
    global queue
    url = 'https://www.zhihu.com/node/TopicsPlazzaListV2'
    isGet = True
    offset = -20
    data_id = str(data_id)
    while isGet:
        offset = offset + 20
        values = {'method': 'next', 'params': '{"topic_id":' + data_id + ',"offset":' + str(offset) + ',"hash_id":""}'}
        try:
            msg = None
            try:
                data = urllib.urlencode(values)
                request = urllib2.Request(url, data, headers)
                response = urllib2.urlopen(request, None, 5)
                html = response.read().decode('utf-8')
                json_str = json.loads(html)
                ms = json_str['msg']
                if len(ms) < 5:  # a short response means we have paged past the end
                    break
                msg = ms[0]
            except Exception as e:
                print "eeeee", e
            # print msg
            if msg is not None:
                soup = BeautifulSoup(str(msg))
                blks = soup.find_all('div', {'class': 'blk'})
                for blk in blks:
                    page = blk.find('a').get('href')
                    if page is not None:
                        node = page.replace("/topic/", "")  # put more seed topics into the queue
                        parent = name
                        ne = blk.find('strong').text
                        try:
                            queue.put((node, ne, parent))  # seed the queue first
                            while queue.qsize() > 0:
                                n, name, p = queue.get()  # pop the head node
                                size = queue.qsize()
                                if size > 0:
                                    print size
                                getContent(n, name, p, top_id)
                                getChildren(n, name)  # enqueue the popped node's children
                                conn.commit()
                        except Exception as e:
                            print "what's wrong", e
        except urllib2.URLError as e:
            print "error is", e
if __name__ == '__main__':
    i = 0
    while i < 400:
        new_work()
        i = i + 1
A note on the database: I'm not attaching a dump here. Just build the tables yourself from the fields used in the code — it really is that simple. I used MySQL; adapt the schema to your own needs.
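In case it saves you some reverse engineering, here is a minimal sketch of the tables the script touches, inferred purely from its SELECT/INSERT statements; the column types and lengths are my own assumptions, so adjust them to your data.
# Schema sketch inferred from the queries above; types/lengths are assumptions.
import MySQLdb as mdb

conn = mdb.connect('127.0.0.1', 'root', 'root', 'zhihu', charset='utf8')
curr = conn.cursor()
for ddl in [
    """CREATE TABLE IF NOT EXISTS classify_new (
           id INT AUTO_INCREMENT PRIMARY KEY,
           data_id VARCHAR(32),
           name VARCHAR(255))""",
    """CREATE TABLE IF NOT EXISTS classify_new_copy (  -- status=1 marks rows to crawl
           id INT AUTO_INCREMENT PRIMARY KEY,
           data_id VARCHAR(32),
           name VARCHAR(255),
           status TINYINT DEFAULT 1)""",
    """CREATE TABLE IF NOT EXISTS classify (  -- used by the older work() path
           id INT AUTO_INCREMENT PRIMARY KEY,
           node VARCHAR(32),
           parent VARCHAR(255),
           name VARCHAR(255),
           status TINYINT DEFAULT 1)""",
    """CREATE TABLE IF NOT EXISTS rooms (
           id INT AUTO_INCREMENT PRIMARY KEY,
           father_id INT,
           name VARCHAR(255),
           friends_num INT,
           description TEXT,
           create_time DATETIME,
           creater_id INT,
           room_avatar VARCHAR(255),
           is_pass TINYINT,
           has_index TINYINT,
           reason_id INT)""",
]:
    curr.execute(ddl)
conn.commit()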
If anything is unclear, come find me on Quzhuanpan (去轉盤網) — I built that site too, and the current QQ group number is kept up to date there. I'm not leaving a QQ number here in case the system bans it.