一直想整個爬蟲玩玩,之前用Java試過...的確是術業有專攻啊,Python寫起爬蟲來更加方便
今天的成果:
main檔案
主要的方法都封裝在了spider_cnblogs裡了(Python 模組名不能含連字號,與 main.py 的 import 一致),這裡主要傳遞一個url,待會程式碼貼在後邊
spider_cnblogs
大致的思路是這樣的,先用requests傳送請求,然後使用BeautifulSoup進行html解析,(推薦使用CSS選擇器的方式獲取想要的內容),解析完成後持久化到資料庫,這裡使用了阿里雲的ECS,裡面安裝了一個MySQL。
程式碼
main.py
from black_fish.cnblogs.spider_cnblogs import Cnblogs
if __name__ == '__main__':
    # Crawl the three cnblogs listing pages: front page, top-views (48h), candidate.
    for url in ("https://www.cnblogs.com",
                "https://www.cnblogs.com/aggsite/topviews",
                "https://www.cnblogs.com/candidate/"):
        Cnblogs.executeSpider(url)
spider_cnblogs.py
import requests
from bs4 import BeautifulSoup
import pymysql
class Cnblogs:
    """One scraped cnblogs.com post, plus a scraper that persists a listing page to MySQL."""

    def __init__(self, id, title, href, date, star_num, comment_num, view_num):
        self.id = id                    # DB primary key placeholder (0 before insert)
        self.title = title              # post title text
        self.href = href                # link to the post
        self.date = date                # publication date string as shown on the page
        self.star_num = star_num        # digg/like count
        self.view_num = view_num        # page-view count
        self.comment_num = comment_num  # comment count

    def print(self):
        """Dump all fields on one line for quick debugging."""
        print(self.id, self.title, self.href, self.date,
              self.star_num, self.comment_num, self.view_num)

    @staticmethod
    def executeSpider(cnblogs_url):
        """Scrape one cnblogs listing page and insert every post into MySQL.

        Fetches *cnblogs_url*, parses the post list with BeautifulSoup
        (CSS selectors) and writes one row per post into the ``cnblogs``
        table.  Raises ``requests.HTTPError`` on a non-2xx response.
        """
        response = requests.get(cnblogs_url)
        response.raise_for_status()  # fail fast instead of parsing an error page
        # Explicit parser avoids bs4's GuessedAtParserWarning and
        # platform-dependent parse results.
        bs = BeautifulSoup(response.text, "html.parser")

        # Title anchors: .string is the title, href the post URL.
        main_items = bs.select(".post-item-title")
        # Footer spans come in groups of four per post:
        # publish date, digg count, comment count, view count.
        meta_texts = [span.string
                      for span in bs.select(".post-item-foot>.post-meta-item span")]

        # NOTE(review): host/credentials belong in config or env vars, not source.
        # pymysql >= 1.0 requires keyword arguments for connect().
        db = pymysql.connect(host="47.103.6.247", user="username",
                             password="password", database="black_fish_db")
        try:
            with db.cursor() as cursor:
                sql = ("insert into cnblogs(title, href, date, star_num, comment_num, view_num) "
                       "value(%s,%s,%s,%s,%s,%s)")
                for index, item in enumerate(main_items):
                    base = index * 4  # start of this post's four meta spans
                    cnblog = Cnblogs(0, item.string, item.attrs['href'],
                                     meta_texts[base], int(meta_texts[base + 1]),
                                     int(meta_texts[base + 2]), int(meta_texts[base + 3]))
                    val = (cnblog.title, cnblog.href, cnblog.date,
                           cnblog.star_num, cnblog.comment_num, cnblog.view_num)
                    print(val)
                    cursor.execute(sql, val)
            db.commit()  # one commit for the whole page instead of one per row
        finally:
            db.close()  # always release the connection, even on parse/DB errors