Scraping Data with BeautifulSoup

Posted by xushaobo on 2018-01-05

Install BeautifulSoup 4 either from the system package manager or from pip (the pip package for bs4 is beautifulsoup4, not the legacy BeautifulSoup package):

sudo apt-get install python3-bs4
pip install beautifulsoup4
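After installing, a quick check (a minimal sketch, not part of the original script) confirms that the bs4 package imports and parses HTML:

from bs4 import BeautifulSoup

# parse a tiny snippet and pull out the text; should print "hello"
print(BeautifulSoup('<p>hello</p>', 'html.parser').p.get_text())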

import urllib.request
from bs4 import BeautifulSoup
import re
from math import ceil
import time

def qiyeinfo(picurl):
    """Fetch one company's detail page and append its info to a text file."""
    time.sleep(1)  # throttle requests so we don't hammer the site
    info = {}
    qiyeid = picurl.split('/')[-2]  # company id embedded in the URL
    picurl = picurl + 'company_detail.html'
    useragent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    headers = {'User-Agent': useragent}
    req = urllib.request.Request(picurl, headers=headers)
    html1 = urllib.request.urlopen(req, timeout=5)
    # the site serves GB-encoded pages, so tell BeautifulSoup to decode as gb18030
    bsObj = BeautifulSoup(html1, 'html.parser', from_encoding='gb18030')
    html1.close()
    try:
        qiyedata = bsObj.find('div', {'class': 'data'})  # <div class="data"> holds the company fields
        tel = bsObj.find('div', {'class': 'telephone'}).get_text()
        qiyename = qiyedata.p.get_text()
        contactsname = bsObj.findAll('div', {'class': 'l-content'})[1].a.get_text()
        with open(r'F:\test.txt', 'a+') as f:
            f.write('企業url: ' + picurl + '\n')
            f.write('企業名稱:' + qiyename + '\n')
            f.write('聯絡人:' + contactsname + '\n')
            f.write('手機: ' + tel + '\n')
            for i in qiyedata.find('ul').findAll('li'):
                f.write(i.get_text() + '\n')
            f.write('\n')
    except Exception:
        # skip company pages whose layout doesn't match the selectors above
        pass

def qiyelist(picurl):
    """Walk the paginated listing and hand each company URL to qiyeinfo()."""
    useragent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    headers = {'User-Agent': useragent}
    req = urllib.request.Request(picurl, headers=headers)
    html = urllib.request.urlopen(req, timeout=10)
    bsObj = BeautifulSoup(html, 'html.parser', from_encoding='gb18030')
    html.close()
    listnum = bsObj.find('div', {'class': 'tit tit2'}).em.get_text()  # total number of listings
    a = int(listnum) / len(bsObj.findAll('h4'))  # rough page count; unused, the range below is hardcoded
    for page in range(15, 25):  # only listing pages 15-24 are fetched here
        listurl = '%s/pn%s' % (picurl, page)  # listing pages are paginated as .../pnN
        req = urllib.request.Request(listurl, headers=headers)
        html = urllib.request.urlopen(req, timeout=5)
        bsObj = BeautifulSoup(html, 'html.parser', from_encoding='gb18030')
        html.close()
        for h4 in bsObj.findAll('h4'):  # each <h4> links to one company detail page
            qiyeurl = h4.a.attrs['href']
            qiyeinfo(qiyeurl)

if __name__ == '__main__':
    qiyelist('http://b2b.huangye88.com/jiangxi/food')
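The scraping above boils down to a handful of BeautifulSoup calls: find() locates the first tag matching a class, findAll() collects every match, get_text() extracts the visible text, and attrs exposes attributes such as href. The following self-contained sketch shows the same pattern on an inline HTML string; the markup and the example.com URLs are made up for illustration and are not taken from huangye88.com:

from bs4 import BeautifulSoup

# hypothetical markup standing in for one listing page
sample = '''
<div class="tit tit2"><em>2</em></div>
<h4><a href="http://example.com/company/1001/">Company A</a></h4>
<h4><a href="http://example.com/company/1002/">Company B</a></h4>
'''

soup = BeautifulSoup(sample, 'html.parser')

# find() returns the first matching tag; get_text() strips the markup
total = soup.find('div', {'class': 'tit tit2'}).em.get_text()
print('total listings:', total)

# findAll() returns every match; attrs holds the tag's attributes
for h4 in soup.findAll('h4'):
    print(h4.a.get_text(), '->', h4.a.attrs['href'])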
