京東商品資訊爬蟲

weixin_33976072發表於2017-08-14

最近閒著在家無聊,就看看爬蟲的書籍,突然發現很有趣,就寫了許多程式碼,爬取了許多的網站,今天就分享爬取京東的原始碼。

#京東商品資訊爬蟲  
#爬取京東商品資訊並儲存到csv格式檔案中  
#2017-7-23  
  
  
import os  
import requests  
import csv  
from bs4 import BeautifulSoup  
  
#獲取url請求  
def gethtml(kind, page, timeout=10):
    """Fetch one JD search-result page and return its HTML text.

    Args:
        kind: Search keyword, sent as the ``keyword`` query parameter.
        page: Page number requested by the caller. It is shown in the
            progress message and doubled to form the ``page`` query
            parameter of the request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded response body, or ``''`` when the request fails or the
        server answers with an error status.
    """
    pagenum = str(2 * page)
    try:
        # Let requests assemble the query string so that keywords containing
        # '&', '#', spaces, etc. are percent-encoded instead of breaking the URL.
        r = requests.get(
            'https://search.jd.com/Search',
            params={'keyword': kind, 'enc': 'utf-8', 'page': pagenum},
            timeout=timeout,
        )
        r.raise_for_status()
        # Use the encoding detected from the body rather than the header guess.
        r.encoding = r.apparent_encoding
    except requests.RequestException:
        print('連結異常!!!')
        return ''
    print('爬取第{}頁:'.format(page))
    return r.text
  
#獲取定位資源  
def findhtml(html, httplist):
    """Parse a search-result page and append one row per product to ``httplist``.

    Each appended row is ``[title, price, link, comment_count]``. A field
    that cannot be located is stored as ``''``. Result cards that have no
    name anchor are skipped. One progress line is printed per product.

    Args:
        html: HTML text of the result page (may be empty).
        httplist: List that is extended in place with the product rows.
    """
    soup = BeautifulSoup(html, 'lxml')
    for card in soup.find_all('div', class_='gl-i-wrap'):
        # Product name and link live on the anchor inside the name div.
        namediv = card.find('div', class_='p-name p-name-type-2')
        anchor = namediv.a if namediv is not None else None
        if anchor is None:
            # Card without the expected name markup: skip it instead of
            # aborting the whole page.
            continue
        title = anchor.get('title', '')
        href = anchor.get('href', '')

        # Reset per product so a card without a price never shows the
        # price of the previous product.
        price = ''
        pricediv = card.find('div', class_='p-price')
        try:
            price = pricediv.strong['data-price']
        except (AttributeError, KeyError, TypeError):
            # Missing price div, missing <strong>, or missing attribute.
            pass

        # Only protocol-relative links ('//host/path') need a scheme added.
        if href.startswith('//'):
            href = 'https:' + href

        # Comment count is the second child of <strong> inside the commit div.
        commitdiv = card.find('div', class_='p-commit')
        try:
            number = commitdiv.strong.contents[1].string
        except (AttributeError, IndexError):
            number = ''
        if number is None:
            number = ''

        httplist.append([title, price, href, number])

        if price:
            print('{:^10s}:{:<}元'.format(title, price))
        else:
            print('{:^10s}'.format(title))
  
  
#儲存資源  
def savehtml(ul):
    """Write the collected product rows to the CSV file in the output folder.

    The file is rewritten from scratch on every call: a header row followed
    by the first four fields of each non-empty row in ``ul``.

    Args:
        ul: List of rows, each ``[title, price, link, comment_count]``.
            Empty rows are skipped.
    """
    path = 'D:/資料/'
    # Create the output folder (and any missing parents) if needed.
    os.makedirs(path, exist_ok=True)
    # newline='' is required by the csv module (otherwise blank rows appear
    # on Windows). An explicit UTF-8 encoding with BOM avoids locale-dependent
    # UnicodeEncodeError and lets spreadsheet tools detect the encoding.
    with open(path + '京東商品資訊爬蟲.csv', 'w', newline='',
              encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(['商品', '價格', '連結', '評價數'])
        writer.writerows(row[:4] for row in ul if row)
  
  
  
#程式主體  
if __name__ == '__main__':
    # Script entry: ask for a keyword and a page count, crawl pages
    # 1..page count, and keep the CSV file up to date along the way.
    goods = input('請輸入要搜尋的物品:')
    yeshu = int(input('請輸入要查詢到的頁數:'))
    ulist = []
    try:
        # Write the header-only file first so an unwritable output path is
        # reported before any page is fetched.
        savehtml(ulist)
        for i in range(1, yeshu + 1):
            findhtml(gethtml(goods, i), ulist)
            # Rewrite the file after every page so partial results survive
            # a later failure.
            savehtml(ulist)
    except Exception as exc:
        # Stop crawling on the first unexpected error, but report the cause
        # instead of hiding it.
        print('爬取中止:{!r}'.format(exc))

相關文章