I. Demo
II. Modules Used
- Python 3.7
- wxPython
- re
- pymysql
- BeautifulSoup
- time
- urllib
- webbrowser
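Only wxPython, pymysql, and BeautifulSoup (imported from the `bs4` package) are third-party; `re`, `time`, `urllib`, and `webbrowser` ship with Python. Assuming a standard setup, the third-party ones can be installed with `pip install wxPython pymysql beautifulsoup4`.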
III. Feature Description
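Enter the URL of your CSDN blog profile along with the name, user, and password of a local MySQL database. The Test Connection button verifies that the database is reachable; the Query button downloads the profile page, parses out the statistics (points, followers, likes, comments, favorites, original posts, weekly rank, overall rank, visits, and level), and saves them into a local table.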
IV. Code Implementation
1. Entry / startup code
import wx
from MyMenu import MyMenu


class MyApp(wx.App):
    def OnInit(self):
        # Create and show the main window defined in MyMenu.py
        frame = MyMenu(None, -1, 'My CSDN Blog Data Query Tool')
        frame.Show(True)
        return True


app = MyApp(0)
app.MainLoop()
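One detail worth noting: the first positional argument of `wx.App` is `redirect`, so `MyApp(0)` keeps `print` output and tracebacks on the console rather than redirecting them to a wxPython log window, which is handy while debugging the crawler.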
2. GUI development code
import pymysql
import wx
import webbrowser as brower
from Crawl import Crawl


class MyMenu(wx.Frame):
    def __init__(self, parent, id, title):
        super(MyMenu, self).__init__(parent, id, title)
        self.SetSize(wx.Size(600, 400))
        self.SetIcon(wx.Icon('ico/crawl.ico', wx.BITMAP_TYPE_ICO))
        self.Centre()
        self.qt = None
        self.link = None
        panel = wx.Panel(self, -1)
        w = self.GetSize().width
        h = self.GetSize().height
        # Integer division keeps the positions as ints for wx
        label_url = wx.StaticText(panel, -1, "[My Blog] URL:", pos=((w - 300) // 2, (h - 300) // 2))
        label_dbname = wx.StaticText(panel, -1, "Local database name:", pos=((w - 300) // 2, (h - 300) // 2 + 50))
        label_user = wx.StaticText(panel, -1, "Database user:", pos=((w - 300) // 2, (h - 300) // 2 + 100))
        label_pwd = wx.StaticText(panel, -1, "Database password:", pos=((w - 300) // 2, (h - 300) // 2 + 150))
        self.entry_url = wx.TextCtrl(panel, -1, size=(200, 30), pos=((w - 300) // 2 + 100, (h - 300) // 2))
        self.entry_dbname = wx.TextCtrl(panel, -1, size=(200, 30), pos=((w - 300) // 2 + 100, (h - 300) // 2 + 50))
        self.entry_user = wx.TextCtrl(panel, -1, size=(200, 30), pos=((w - 300) // 2 + 100, (h - 300) // 2 + 100))
        self.entry_pwd = wx.TextCtrl(panel, -1, size=(200, 30), pos=((w - 300) // 2 + 100, (h - 300) // 2 + 150))
        left, top = self.entry_pwd.Position
        self.but_test_connect = wx.Button(panel, -1, "Test Connection", size=(100, 30), pos=(left, top + 50))
        self.but_search = wx.Button(panel, -1, "Query", size=(80, 30), pos=(left + 120, top + 50))
        self.but_test_connect.SetBackgroundColour("#009ad6")
        self.but_search.SetBackgroundColour("#1d953f")
        self.Bind(wx.EVT_BUTTON, self.on_test_connect, self.but_test_connect)
        self.Bind(wx.EVT_BUTTON, self.on_search, self.but_search)
        self.url = ''
        self.db_name = ''
        self.db_user = ''
        self.user_pwd = ''
        self.init_ui()
"""
建立選單欄
"""
def init_ui(self):
menu = wx.MenuBar()
'''設定選單'''
file = wx.Menu()
self.qt = wx.MenuItem(file, 103, '&Quit\tCtrl+Q', 'Quit the Application')
self.qt.SetBitmap(wx.Bitmap('ico/quit.ico', wx.BITMAP_TYPE_ICO))
file.Append(self.qt)
help = wx.Menu()
self.link = wx.MenuItem(help, 201, '&About', 'About Introduction')
help.Append(self.link)
menu.Append(file, '&File')
menu.Append(help, '&Help')
self.SetMenuBar(menu)
self.Bind(wx.EVT_MENU, self.on_quit, self.qt)
self.Bind(wx.EVT_MENU, self.on_link, self.link)
self.Show()
    def alert_error(self, word=""):
        """Pop up a warning dialog."""
        dlg = wx.MessageDialog(None, word, u"Warning", wx.YES_NO)
        if dlg.ShowModal() == wx.ID_YES:
            pass
        dlg.Destroy()

    def alert_success(self, word=""):
        """Pop up an information dialog."""
        dlg = wx.MessageDialog(None, word, u"Info", wx.YES_NO)
        if dlg.ShowModal() == wx.ID_YES:
            pass
        dlg.Destroy()
    def on_test_connect(self, event, is_alert=True):
        """Test the MySQL connection with the values entered in the form."""
        self.db_name = self.entry_dbname.GetValue()
        self.db_user = self.entry_user.GetValue()
        self.user_pwd = self.entry_pwd.GetValue()
        if self.db_name and self.db_user and self.user_pwd:
            conn = None  # so the finally block is safe if connect() fails
            try:
                conn = pymysql.connect(host="127.0.0.1"
                                       , user=self.db_user
                                       , password=self.user_pwd
                                       , db=self.db_name
                                       , port=3306)
                if is_alert:
                    self.alert_success('Connection succeeded!')
                return True
            except Exception as e:
                print(e)
                self.alert_error('Connection failed!')
                return False
            finally:
                if conn is not None:
                    conn.close()
        else:
            self.alert_error(word='Some fields are empty!')
            return False
    def on_search(self, event):
        """Crawl the blog statistics and save them to the local database."""
        self.url = self.entry_url.GetValue()
        if self.url == '':
            self.alert_error('URL cannot be empty!')
            return False
        flag = self.on_test_connect(event, is_alert=False)
        if flag:
            # Crawl() always returns a (truthy) instance, so success has to be
            # detected by catching exceptions, not by testing the return value
            try:
                Crawl(url=self.url
                      , db_name=self.db_name
                      , user=self.db_user
                      , pwd=self.user_pwd)
                is_success = True
            except Exception as e:
                print(e)
                is_success = False
            if is_success:
                self.alert_success('Query succeeded and the data was saved to the local database. Go check it out!')
            else:
                self.alert_error('Query failed!')
    def on_quit(self, event):
        """Close the window."""
        self.Close()

    def on_link(self, event):
        """Open the About page in the default browser."""
        brower.open('https://blog.csdn.net/qq_19314763/article/details/110951204')
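The same connection check is easy to exercise outside the GUI. Here is a minimal standalone sketch of the logic in `on_test_connect`, assuming the same hard-coded `127.0.0.1:3306` server; `test_connection` is an illustrative name, not part of the project:

```python
import pymysql

def test_connection(db_name, user, pwd, host="127.0.0.1", port=3306):
    """Return True if the credentials can open a connection, else False."""
    conn = None
    try:
        conn = pymysql.connect(host=host, user=user, password=pwd,
                               db=db_name, port=port)
        return True
    except Exception as e:
        print("connection failed:", e)
        return False
    finally:
        # close only if connect() actually succeeded
        if conn is not None:
            conn.close()
```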
3. Crawler code for the blog statistics
import urllib.request as request
import urllib.error as error
import time
from bs4 import BeautifulSoup
import pymysql
import re


class Crawl:
    def __init__(self, url, db_name, user, pwd):
        self.url = url
        self.db_name = db_name
        self.db_user = user
        self.user_pwd = pwd
        self.main()
"""
爬取整個HTML頁面
"""
def download(self, crawl_url, num_retries=3):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/72.0.3626.121 Safari/537.36',
}
req = request.Request(crawl_url, headers=headers)
try:
crawl_html = request.urlopen(req).read().decode("utf-8")
except error.URLError as e:
print("download error:", e.reason)
crawl_html = None
if num_retries > 0:
if hasattr(e, "code") and 500 <= e.code <= 600:
time.sleep(5000)
return self.download(crawl_url, num_retries-1)
return crawl_html
"""
解析HTML獲取頁面上的10組資料
"""
def parse_page(self, page_html):
data_dict = {'積分': '', '粉絲': '', '獲贊': '', '評論': '', '收藏': ''
, '原創': '', '周排名': '', '總排名': '', '訪問': '', '等級': ''
, '賬號': '', '暱稱': ''}
soup = BeautifulSoup(page_html, "html.parser")
dl_list = soup.find_all('dl', class_='text-center')
for dl in dl_list:
dd_name = dl.select('dd')[0].text
dd_title = dl.get('title')
for k in data_dict.keys():
if dd_name == k:
data_dict[k] = dd_title
self_info_list = soup.find_all('a', id='uid')
alias = self_info_list[0].get('title')
account = self_info_list[0].select('span')[0].get('username')
data_dict.update({"賬號": account, "暱稱": alias})
data_dict.update({"爬取時間": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())})
return data_dict
"""
判斷表是否存在
"""
def table_exists(con, table_name):
sql = "show tables;"
con.execute(sql)
tables = [con.fetchall()]
table_list = re.findall('(\'.*?\')',str(tables))
table_list = [re.sub("'", '', each) for each in table_list]
if table_name in table_list:
return 1
else:
return 0
    def save_to_mysql(self, data_obj):
        """Save the parsed dict into the local MySQL database."""
        conn = pymysql.connect(
            host="127.0.0.1",
            port=3306,
            user=self.db_user,
            password=self.user_pwd,
            database=self.db_name
        )
        cursor = conn.cursor()
        sql_create = """CREATE TABLE IF NOT EXISTS `csdn_self_blog_data` (
                        `id` bigint NOT NULL AUTO_INCREMENT,
                        `account` varchar(100) DEFAULT NULL,
                        `alias` varchar(100) DEFAULT NULL,
                        `grade` int DEFAULT NULL,
                        `count_fan` int DEFAULT NULL,
                        `count_thumb` bigint DEFAULT NULL,
                        `count_comment` bigint DEFAULT NULL,
                        `count_star` int DEFAULT NULL,
                        `count_original` int DEFAULT NULL,
                        `rank_week` bigint DEFAULT NULL,
                        `rank_all` bigint DEFAULT NULL,
                        `count_scan` bigint DEFAULT NULL,
                        `blog_level` varchar(100) DEFAULT NULL,
                        `crawl_time` datetime DEFAULT NULL,
                        `start_hour` int DEFAULT NULL,
                        `end_hour` int DEFAULT NULL,
                        `crawl_date` date DEFAULT NULL,
                        PRIMARY KEY (`id`)
                    ) ENGINE=InnoDB DEFAULT CHARSET=utf8
                    """
        cursor.execute(sql_create)
        curr_date = time.strftime("%Y-%m-%d", time.localtime())
        curr_time = data_obj['爬取時間']  # crawl timestamp string 'YYYY-MM-DD HH:MM:SS'
        sql_insert = """insert into csdn_self_blog_data(
                        account, alias, grade, count_fan, count_thumb
                        ,count_comment, count_star, count_original, rank_week, rank_all
                        ,count_scan, blog_level, crawl_time, start_hour, end_hour
                        ,crawl_date)
                        values( %s, %s, %s, %s, %s
                        ,%s, %s, %s, %s, %s
                        ,%s, %s, %s, %s, %s
                        ,%s)"""
        # curr_time[11:13] is the hour the crawl ran; end_hour is the next hour
        values_list = [data_obj['賬號'], data_obj['暱稱'], data_obj['積分'], data_obj['粉絲'], data_obj['獲贊']
                       , data_obj['評論'], data_obj['收藏'], data_obj['原創'], data_obj['周排名'], data_obj['總排名']
                       , data_obj['訪問'], data_obj['等級'], curr_time, curr_time[11:13], int(curr_time[11:13]) + 1
                       , curr_date]
        cursor.execute(sql_insert, tuple(values_list))
        conn.commit()
        conn.close()
        print("====================== Saved to database successfully! ======================")
        return True
"""
主程式
"""
def main(self):
html = self.download(self.url)
dict_obj = self.parse_page(html)
self.save_to_mysql(dict_obj)
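Because `Crawl.__init__` immediately runs the whole download-parse-save pipeline via `self.main()`, the crawler can also be driven without the GUI. A minimal sketch with placeholder values (the URL and credentials below are illustrative, not from the post):

```python
# Placeholder values for illustration only
if __name__ == "__main__":
    Crawl(url="https://blog.csdn.net/your_account",  # your CSDN profile URL
          db_name="csdn_db", user="root", pwd="your_password")
```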
4. Complete code
Gitee repository address
V. Reference Links