最近寫了一個爬蟲,用來爬取網易養生頻道的文章。目前只爬取首頁,暫時沒有實作翻頁,後續有空再更新。下面是程式碼:
爬蟲程式碼
import requests
from lxml import etree
from config import db
import urllib.request
import re, os, uuid, time
def run():
    """Scrape the listing page and persist whatever was parsed.

    ``parse`` returns ``None`` when the listing request fails and may return
    an empty list when no entries match, so the insert step is skipped in
    both cases instead of crashing on ``data[0]``.
    """
    data = parse()
    if not data:
        return
    insert(data)
# 解析頁面
def parse():
    """Scrape the NetEase health listing page into a list of article dicts.

    For every ``news_one havepic`` entry on the listing page this fetches the
    linked article page, downloads its inline images and the listing
    thumbnail through ``download_img``, and collects the metadata.

    Returns:
        A list of dicts with the keys ``title``, ``link``, ``source_img``,
        ``sub_title``, ``tags``, ``publish_time``, ``source``, ``content``,
        ``real_source``, ``local_img`` and ``oss_img`` (in that order), or
        ``None`` when the listing page itself could not be fetched.
        An article whose own page cannot be fetched is skipped; it no longer
        discards the articles collected before it.
    """
    url = 'https://jiankang.163.com/special/yangsheng_n/?1'
    try:
        res = requests.get(url, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print("請求失敗")
        print(str(exc))
        return None
    if res.status_code != 200:
        print("請求失敗")
        return None
    print("請求成功")

    html = etree.HTML(res.content)
    data = []
    # Guard against an empty parse result before running XPath on it.
    if html is None:
        return data

    for news in html.xpath('//div[@class="news_one havepic"]'):
        # Link to the full article.
        content_url = _first(news.xpath('./div[@class="news_main_info"]/h2/a/@href'))
        content = source = ''
        if content_url:
            fetched = _fetch_article(content_url)
            if fetched is None:
                # Only this article failed; keep what was already collected.
                continue
            content, source = fetched

        source_img = news.xpath('./div[@class="news_main_info"]/a/img/@src')
        # download_img returns an empty list when there is nothing to fetch.
        local_img = _first(download_img(source_img))

        data.append({
            # Title
            'title': _first(news.xpath('./div[@class="news_main_info"]/h2/a/text()')),
            # Article link
            'link': content_url,
            # Thumbnail URL on the source site
            'source_img': _first(source_img),
            # Sub-title
            'sub_title': _first(news.xpath('./div[@class="news_main_info"]/p/text()')).strip(),
            # Tags (list of strings)
            'tags': news.xpath('./div[@class="news_sub_info"]/p[@class="keywords"]/a/text()'),
            # Publish time
            'publish_time': _first(
                news.xpath('./div[@class="news_sub_info"]/p[@class="pubtime"]/text()')).strip(),
            # Source: the crawled site, taken from the share widget
            'source': _first(
                news.xpath('./div[@class="news_sub_info"]/div[@class="news_share"]/ul/@data-source')),
            # Article body
            'content': content,
            # Real source: where the crawled site says the article came from
            'real_source': source,
            # Local path of the downloaded thumbnail
            'local_img': local_img,
            # Remote (OSS) path, filled in elsewhere
            'oss_img': '',
        })
    return data


# Seconds to wait for each HTTP response before giving up.
_REQUEST_TIMEOUT = 10


def _first(nodes, default=''):
    """Return the first XPath result, or *default* when nothing matched."""
    return nodes[0] if nodes else default


def _repair_text(text):
    """Re-decode text that was read as ISO-8859-1 but is really UTF-8.

    The repair runs before stripping: bytes 0x85 and 0xA0 look like
    whitespace once mis-decoded, so stripping first could cut a multi-byte
    character in half. Text that is not such mojibake is returned stripped
    but otherwise unchanged.
    """
    try:
        text = text.encode('iso-8859-1').decode('utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        pass
    return text.strip()


def _fetch_article(content_url):
    """Fetch one article page and return ``(content, source)``, or ``None`` on failure."""
    try:
        res_content = requests.get(content_url, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print("請求內容失敗")
        print(str(exc))
        return None
    if res_content.status_code != 200:
        print("請求內容失敗")
        return None
    print("請求內容成功")

    content_html = etree.HTML(res_content.content)
    if content_html is None:
        return '', ''

    # Article source: the text after the last ':' in the post_info line.
    article_source = _first(
        content_html.xpath('//div[@class="post_main"]/div[@class="post_info"]/text()'))
    source = _repair_text(article_source).split(':')[-1].strip()

    # Article body: one output line per <p>, inline images first.
    paragraphs = content_html.xpath('//div[@class="post_main"]/div[@class="post_content"]/div['
                                    '@class="post_body"]/p')
    parts = []
    for p in paragraphs:
        images = p.xpath('./img/@src')
        if images:
            for file in download_img(images) or []:
                parts.append('<img src="' + file + '">')
        # Text nodes of the paragraph plus the text of any <a> children.
        p_text = ''.join(p.xpath('./text() | ./a/text()'))
        parts.append(_repair_text(p_text) + "\n")
    return ''.join(parts), source
# 插入文章和標籤
def insert(data):
    """Store scraped articles in ``dl_articles`` and their tags in ``dl_tags``.

    Args:
        data: List of article dicts. Every key of the first dict except
            ``tags`` is used as a ``dl_articles`` column; ``tags`` is a list
            of tag names written to ``dl_tags``. ``None`` or an empty list is
            a no-op. The dicts are not modified.

    Articles whose title already exists are skipped. A failed article insert
    or a failed tag lookup stops processing (after a rollback for the
    insert); a failed tag insert is rolled back and the next tag is tried.
    All scraped values are passed as query parameters rather than being
    formatted into the SQL text, so quotes in titles or tags cannot break or
    inject into the statements.
    """
    if not data:
        return

    DB = db.DB()
    cur = DB.cursor()

    # Column list / named placeholders for the article insert.
    columns = [k for k in data[0] if k != 'tags']
    field = ', '.join('`{}`'.format(k) for k in columns)
    values = ', '.join('%({})s'.format(k) for k in columns)
    insert_article_sql = "insert into %s (%s) values (%s)" % ('dl_articles', field, values)

    sql_exists = "select id from dl_articles where title = %s"
    select_tags_sql = "select id from dl_tags where `article_id` = %s and `name` = %s"
    insert_tags_sql = "insert into dl_tags (`article_id`, `name`, `status`) values (%s, %s, %s)"

    for article in data:
        # Skip articles that are already stored (matched by title).
        title = article.get('title')
        print(sql_exists, (title,))
        cur.execute(sql_exists, (title,))
        if cur.fetchall():
            print("文章已存在")
            continue

        # Insert the article row; only the article columns are bound.
        row = {k: article.get(k) for k in columns}
        try:
            cur.execute(insert_article_sql, row)
            DB.commit()
            print("插入文章成功")
        except Exception as e:
            print("插入文章失敗")
            print(str(e))
            DB.rollback()
            return

        # Without a usable row id the tags cannot be linked to the article.
        article_id = cur.lastrowid
        if not article_id or article_id < 1:
            continue

        for t in article.get('tags') or []:
            tag_params = (article_id, t)
            print(select_tags_sql, tag_params)
            try:
                cur.execute(select_tags_sql, tag_params)
                tag_ids = cur.fetchall()
            except Exception as e:
                print("查詢標籤失敗")
                print(str(e))
                return
            # This article already has the tag (e.g. a repeated name).
            if tag_ids:
                continue

            new_tag = (article_id, t, 0)
            print(insert_tags_sql, new_tag)
            try:
                cur.execute(insert_tags_sql, new_tag)
                DB.commit()
                print("插入標籤成功")
            except Exception as e:
                print("插入標籤失敗")
                print(str(e))
                DB.rollback()
# 下載圖片
def download_img(images):
    """Download image URLs into ``./download/images/<YYYYMM>/<DD>/``.

    Args:
        images: List of image URLs. Any other type, or an empty list, yields
            an empty result and nothing is written to disk.

    Returns:
        Local file paths of the images that were downloaded, in input order.
        Each file is named with a random UUID plus an extension: the first of
        ``jpg``, ``png``, ``jpeg`` that occurs in the URL, else ``jpg``.
        An image that fails to download is reported and left out.
    """
    data = []
    if not isinstance(images, list) or not images:
        return data

    # Take one timestamp so month and day cannot disagree across midnight.
    now = time.localtime()
    path = './download/images/' + time.strftime("%Y%m", now) + '/' + time.strftime("%d", now)
    os.makedirs(path, exist_ok=True)

    suffix_list = ['jpg', 'png', 'jpeg']
    for img in images:
        print(img)
        # Choose the extension per image; the default must not be inherited
        # from whatever the previous image happened to match.
        suffix = next((s for s in suffix_list if s in img), 'jpg')
        filepath = path + '/' + str(uuid.uuid4()) + '.' + suffix
        # urlretrieve needs a scheme; give protocol-relative URLs one.
        url = 'https:' + img if img.startswith('//') else img
        try:
            urllib.request.urlretrieve(url, filepath)
        except (OSError, ValueError) as exc:
            # One broken image should not abort the remaining downloads.
            print("下載圖片失敗")
            print(str(exc))
            continue
        data.append(filepath)
    return data
執行程式碼
> python .\main.py
請求成功
請求內容成功
http://cms-bucket.ws.126.net/2021/0929/0128808cj00r0647z000mc0009c0070c.jpg?imageView&thumbnail=150y100
請求內容成功
https://nimg.ws.126.net/?url=http%3A%2F%2Fcms-bucket.ws.126.net%2F2021%2F0928%2F6734e010j00r04eeu00kpc000k0032dc.jpg&thumbnail=660x2147483647&quality=80&type=jpg
http://cms-bucket.ws.126.net/2021/0928/9027cb39j00r04egc000qc0009c0070c.jpg?imageView&thumbnail=150y100
請求內容成功
http://cms-bucket.ws.126.net/2021/0926/b982d3aaj00r01tgi000nc0009c0070c.jpg?imageView&thumbnail=150y100
請求內容成功
http://cms-bucket.ws.126.net/2021/0925/a96809c3j00qzzq84000pc0009c0070c.jpg?imageView&thumbnail=150y100
請求內容成功
http://cms-bucket.ws.126.net/2021/0924/6cff9112j00qzx3np001ac0009c0070c.jpg?imageView&thumbnail=150y100
請求內容成功
http://cms-bucket.ws.126.net/2021/0923/7c8f7b5bj00qzvkee000sc0009c0070c.jpg?imageView&thumbnail=150y100
請求內容成功
http://cms-bucket.ws.126.net/2021/0923/86cdbf99j00qzv050000jc0009c0070c.jpg?imageView&thumbnail=150y100
請求內容成功
http://cms-bucket.ws.126.net/2021/0922/11309378j00qzt3ny002yc0009c0070c.jpg?imageView&thumbnail=150y100
請求內容成功
https://nimg.ws.126.net/?url=http%3A%2F%2Fcms-bucket.ws.126.net%2F2021%2F0917%2F02364e55j00qzl25900dec000k001vgc.jpg&thumbnail=660x2147483647&quality=80&type=jpg
http://cms-bucket.ws.126.net/2021/0917/b6c35977j00qzl26s0011c0009c0070c.jpg?imageView&thumbnail=150y100
請求內容成功
http://cms-bucket.ws.126.net/2021/0916/0679e91fp00qzj54e001rc0009c0070c.png?imageView&thumbnail=150y100
select id from dl_articles where title = '記好這些要訣,不吃安眠藥也能睡個好覺'
插入文章成功
select id from dl_tags where `article_id` = 1 and `name` = '安眠藥'
insert into dl_tags (`article_id`, `name`, `status`) values (1, '安眠藥', 0)
插入標籤成功
select id from dl_tags where `article_id` = 1 and `name` = '失眠'
insert into dl_tags (`article_id`, `name`, `status`) values (1, '失眠', 0)
插入標籤成功
...
圖片儲存
文章儲存
標籤儲存
本作品採用《CC 協議》,轉載必須註明作者和本文連結