python爬蟲抓取資料時失敗_python爬蟲 大佬 請教下 為什麼爬取的資料有時能爬到,有時又爬不到,程式碼如下:...

weixin_39781143發表於2020-12-04

import time

import requests

from lxml import etree

import pymysql

class GuPiao_spider():
    '''
    Scrape the stock ranking table served by q.10jqka.com.cn, page by page.

    Every table row is turned into a dict keyed by the names in ``FIELDS``.
    ``run`` asks the site how many pages exist, then fetches and prints the
    rows of each page in turn.
    '''

    # Column names in table order.  They serve both as the keys of every row
    # dict and as the column / placeholder names of ``INSERT_SQL``; deriving
    # the SQL from this tuple keeps the two from drifting apart (the original
    # hand-written statement used a placeholder that matched no dict key).
    FIELDS = (
        '序號', '程式碼', '名稱', '現價', '漲跌幅', '漲跌', '漲速',
        '換手', '量比', '振幅', '成交額', '流通股', '流動市值', '市贏率',
    )

    # Parameterised insert statement using the ``%(name)s`` mapping style.
    INSERT_SQL = 'insert into tonghuashun({}) values ({})'.format(
        ','.join(FIELDS),
        ','.join('%({})s'.format(name) for name in FIELDS),
    )

    # Seconds to wait on the server before a request is abandoned; without a
    # timeout a stalled connection would block the crawl forever.
    REQUEST_TIMEOUT = 10

    # Seconds to pause between two consecutive page requests.
    PAGE_DELAY = 3

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:70.0) Gecko/20100101 Firefox/70.0'
        }
        # URL template of one result page; ``{}`` is the 1-based page number.
        self.url = 'http://q.10jqka.com.cn/index/index/board/all/field/zdf/order/desc/page/{}/ajax/1/'
        # URL of the dynamic quote endpoint (currently unused):
        # self.content_url = 'http://qd.10jqka.com.cn/quote.php?cate=real&type=stock&return=json&callback=showStockData&code='

    def _fetch_tree(self, url):
        '''
        Download ``url`` and parse the body into an lxml HTML tree.

        :param url: address of the page to download
        :return: the parsed document as returned by ``etree.HTML``
        '''
        response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        # The body is decoded as GBK; undecodable bytes are replaced instead
        # of aborting the whole page on a single bad byte.
        return etree.HTML(response.content.decode('gbk', errors='replace'))

    @staticmethod
    def _parse_page_count(pager_texts):
        '''
        Extract the total page count from the pager's text nodes.

        :param pager_texts: text nodes of the pager span; the first one is
            expected to contain a ``/`` with the total after it
        :return: the text after the first ``/``, stripped, as a string
        :raises IndexError: when the pager is missing or has no ``/`` part
            (IndexError is what the unguarded lookup used to raise)
        '''
        if not pager_texts:
            raise IndexError(
                'pager element not found: the response did not contain the expected table page'
            )
        parts = pager_texts[0].split('/')
        if len(parts) < 2:
            raise IndexError('unexpected pager text: {!r}'.format(pager_texts[0]))
        return parts[1].strip()

    @staticmethod
    def _page_numbers(page_num):
        '''
        Return the page numbers to crawl: 1 up to and including ``page_num``.

        :param page_num: total number of pages (int or numeric string)
        :return: a ``range`` covering every page exactly once
        '''
        return range(1, int(page_num) + 1)

    @classmethod
    def _build_info(cls, code, title, cells):
        '''
        Assemble one row dict from the values extracted from a table row.

        :param code: text of the link in the second cell
        :param title: text of the link in the third cell
        :param cells: direct text nodes of the row's cells; the first is the
            sequence number and the next eleven are the remaining columns
        :return: dict whose keys are ``FIELDS``, in that order
        :raises IndexError: when fewer than twelve cell texts are supplied
        '''
        if len(cells) < 12:
            raise IndexError(
                'expected at least 12 cell texts in a stock row, got {}'.format(len(cells))
            )
        info = {cls.FIELDS[0]: cells[0], cls.FIELDS[1]: code, cls.FIELDS[2]: title}
        info.update(zip(cls.FIELDS[3:], cells[1:12]))
        return info

    def get_pages(self):
        '''
        Fetch the first page and read the total number of pages from its pager.

        :return: the total page count as a string
        :raises IndexError: when the pager cannot be found in the response
        '''
        tree = self._fetch_tree(self.url.format(1))
        return self._parse_page_count(tree.xpath('//*[@id="m-page"]/span/text()'))

    def get_content(self, url):
        '''
        Fetch one result page and extract its table rows.

        :param url: address of the page to crawl
        :return: list of row dicts (empty when the page has no table rows)
        :raises IndexError: when a row lacks its code/name link or cells
        '''
        tree = self._fetch_tree(url)
        content_list = []
        for row in tree.xpath('//tbody/tr'):
            codes = row.xpath('td[2]/a/text()')
            titles = row.xpath('td[3]/a/text()')
            if not codes or not titles:
                raise IndexError('stock row is missing its code or name link')
            content_list.append(
                self._build_info(codes[0], titles[0], row.xpath('td/text()'))
            )
        return content_list

    # Persistence is currently disabled; the helper is kept for reference.
    # NOTE(review): it opens a new connection per row and never closes it.
    # def save_mysql(self, sql, data):
    #     '''
    #     Connect to the MySQL database and store the data in it.
    #     :return:
    #     '''
    #     conn = pymysql.connect(
    #         host = 'localhost',
    #         user = 'root',
    #         password = '123456',
    #         port = 3306,
    #         db = 'test'
    #     )
    #     cur = conn.cursor()
    #     cur.execute(sql, data)
    #     conn.commit()

    def run(self):
        '''
        Crawl every page from 1 to the reported total and print each row.

        :return: ``False`` once all pages were processed (kept for
            compatibility with the previous implementation)
        '''
        last_page = int(self.get_pages())  # total number of pages on the site
        for page in self._page_numbers(last_page):
            print('正在爬取第{}頁資料...............'.format(page))
            url = self.url.format(page)
            # print(url)
            for data in self.get_content(url):  # each item is one row dict
                print(data)
                # self.save_mysql(self.INSERT_SQL, data)
            # Pause between requests only; no need to wait after the last page.
            if page != last_page:
                time.sleep(self.PAGE_DELAY)
        return False

# Script entry point: start the crawl only when the file is executed directly,
# not when it is imported.  The dunder names had lost their underscores, which
# made the guard raise NameError on an undefined ``name``.
if __name__ == '__main__':
    spider = GuPiao_spider()
    spider.run()

相關文章