爬蟲:拉勾自動投遞簡歷+資料獲取
相較於 BOSS 直聘,拉勾不會因為訪問時間過長而彈出驗證提示。
import json
import random
import re
import time
import traceback

import redis
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
def login():
    """Log in through the Lagou passport page.

    Uses the module-level ``driver``, ``USERNAME`` and ``PASSWORD``.  After
    submitting the form it blocks until an element with id ``search_input``
    is present; ``WebDriverWait.until`` raises ``TimeoutException`` if that
    does not happen within ten minutes.
    """
    driver.get("https://passport.lagou.com/login/login.html")
    time.sleep(1)
    # find_element(By..., ...) replaces the find_element_by_* helpers, which
    # were removed in Selenium 4.3.
    driver.find_element(By.CSS_SELECTOR, 'input[type="text"]').send_keys(USERNAME)
    driver.find_element(By.CSS_SELECTOR, 'input[type="password"]').send_keys(PASSWORD)
    driver.find_element(By.CSS_SELECTOR, 'input[type="submit"]').click()
    # NOTE(review): the long timeout presumably leaves room for solving a
    # verification challenge by hand before the search box shows up.
    WebDriverWait(driver, 10 * 60).until(
        EC.presence_of_element_located((By.ID, 'search_input')))
    time.sleep(3)
def sleep():
    """Pause for a random whole number of seconds between 6 and 11 inclusive."""
    pause_seconds = random.randrange(6, 12)
    time.sleep(pause_seconds)
def deal(info, key, func, soup):
    """Run one extractor against *soup* and store its result in *info*.

    Args:
        info: dict that is updated in place.
        key: a single key, or a list/tuple of keys.  With several keys the
            values returned by *func* are paired with them positionally
            (``zip`` semantics: surplus keys or values are dropped).
        func: callable taking *soup* and returning the value(s) to store.
        soup: object handed to *func* unchanged.

    Any exception raised during extraction is printed as a traceback and
    swallowed, so one failing field does not stop the caller.
    """
    try:
        value = func(soup)
        # Tuples are treated like lists: a tuple stored as a single dict key
        # could not be serialized by json.dumps later on.
        if isinstance(key, (list, tuple)):
            for name, item in zip(key, value):
                info[name] = item
        else:
            info[key] = value
    except Exception:
        print(traceback.format_exc())
def save_info(base_name, key, data):
    """Store *data* in Redis under the ``base_name`` family of keys.

    The record is JSON-encoded into the hash ``{base_name}_hash`` with
    ``data[key]`` as the field name, and the same identifier is added to the
    sets ``{base_name}_set`` and ``{base_name}_set_bak``.  Uses the
    module-level Redis client ``r``; a missing *key* raises ``KeyError``.
    """
    identifier = data[key]
    payload = json.dumps(data)
    r.hset(f"{base_name}_hash", identifier, payload)
    r.sadd(f"{base_name}_set", identifier)
    r.sadd(f"{base_name}_set_bak", identifier)
def get_start_info(page, city):
    """Parse one search-result page and save every job and company on it.

    Args:
        page: HTML source of a job-list page.
        city: label stored as ``job_city`` on every job found.

    Every ``<li>`` of the result list produces one job record (saved under
    ``lagou_jobs`` keyed by ``job_id``) and one company record (saved under
    ``lagou_companies`` keyed by ``company_id``).  Fields are extracted
    best-effort through ``deal``; an error on one job is printed and the
    remaining jobs are still processed.
    """
    soup = BeautifulSoup(page, 'lxml')
    # Resolve the containers one step at a time: a chained .find().find()
    # raises AttributeError on a page without a result list, and that error
    # would propagate to the caller and end its pagination loop.
    container = soup.find('div', class_='s_position_list')
    listing = None
    if container is not None:
        listing = container.find('ul', class_='item_con_list')
    if listing is None:
        print('no job list found on this page')
        return

    # (record key, attribute on the <li> element)
    job_attrs = (
        ('job_id', 'data-positionid'),
        ('salary', 'data-salary'),
        ('company_name', 'data-company'),
        ('job_name', 'data-positionname'),
        ('company_id', 'data-companyid'),
        ('hr_id', 'data-hrid'),
    )
    for job in listing.find_all('li'):
        try:
            job_info, company_info = {'job_city': city}, {}
            for field, attr in job_attrs:
                deal(job_info, field, lambda x, attr=attr: x[attr], job)
            deal(job_info, 'job_location', lambda x: x.find('span', class_='add').text, job)
            deal(job_info, 'update_date', lambda x: x.find('span', class_='format-time').text, job)
            for field in ['hr_name', 'hr_position', 'hr_portrait']:
                deal(job_info, field,
                     lambda x, cls=field: x.find('input', class_=cls)['value'], job)
            deal(job_info, ['experience', 'education'],
                 lambda x: x.find('div', class_='li_b_l').text.strip().split('\n')[1].split('/'),
                 job)
            deal(job_info, 'job_tags',
                 lambda x: [i for i in x.find('div', class_='list_item_bot').text.strip().split('\n')
                            if i != ''],
                 job)

            company = job.find('div', class_='company')
            deal(company_info, ['company_industry', 'company_financial', 'company_numbers'],
                 lambda x: x.find('div', class_='industry').text.strip().split('/'), company)
            deal(company_info, 'company_href', lambda x: x.find('a')['href'], company)
            deal(company_info, 'company_logo',
                 lambda x: x.find('div', class_='com_logo').find('img')['src'], job)
            # Copy the identifying fields so the company record can be keyed.
            for field in ['company_id', 'company_name']:
                if field in job_info:
                    company_info[field] = job_info[field]

            save_info('lagou_jobs', 'job_id', job_info)
            save_info('lagou_companies', 'company_id', company_info)
        except Exception:
            # e.g. KeyError from save_info when the id could not be extracted
            print(traceback.format_exc())
def get_start_page(url, city):
    """Open a search-result URL and scrape it page by page.

    Args:
        url: address of the first result page.
        city: label passed through to ``get_start_info`` for every page.

    Uses the module-level ``driver``.  After each page the "next" control is
    clicked and the crawl pauses via ``sleep()``; the loop ends when that
    control cannot be found or cannot be clicked.
    """
    driver.get(url)
    try:
        # Best effort: click the overlay button when one is present.
        driver.find_element(By.CSS_SELECTOR, 'div[class="body-btn"]').click()
    except WebDriverException:
        pass  # no overlay (or not clickable) - nothing to dismiss

    index = 1
    while True:
        print(f'---------------------- page:{index} ------------------------------')
        get_start_info(driver.page_source, city)
        try:
            # The selector matches the class attribute exactly (including the
            # trailing space); presumably the control carries an extra class
            # on the last page and therefore stops matching.
            target = driver.find_element(By.CSS_SELECTOR, 'span[class="pager_next "]')
        except NoSuchElementException:
            break
        try:
            driver.execute_script("arguments[0].scrollIntoView();", target)
            target.click()
        except WebDriverException:
            # Unexpected failure: report it instead of ending silently.
            print(traceback.format_exc())
            break
        index += 1
        sleep()
def _salary_matches(salary):
    """Return True when the first two numbers in *salary* fall in the wanted band."""
    bounds = [part for part in re.split('[-Kk·薪]', salary) if part != '']
    return int(bounds[0]) >= 12 and 40 > int(bounds[1]) >= 17


def get_job_info(page, info):
    """Parse a job-detail page, save what it shows and decide whether to apply.

    Args:
        page: HTML source of the job-detail page.
        info: job record, updated in place; it must already contain
            ``job_id`` and ``salary``.

    Returns:
        True when ``info['salary']`` passes the salary filter, False when it
        does not or when parsing/saving raised.

    Side effects: writes the raw page to ``job.html``, saves the enriched
    job, the company shown on the page and every listed similar job through
    ``save_info``.
    """
    # Close the file deterministically and fix the encoding so the dump does
    # not depend on the platform default.
    with open('job.html', 'w', encoding='utf-8') as dump:
        dump.write(page)
    soup = BeautifulSoup(page, 'lxml')
    try:
        deal(info, ['city', 'experience', 'education'],
             lambda x: [re.sub('[/ ]', '', i)
                        for i in x.find('dd', class_='job_request').find('h3')
                        .text.strip().split('\n')[1:-1]],
             soup)
        deal(info, 'position_labels',
             lambda x: x.find('ul', class_='position-label clearfix').text.strip().split('\n'),
             soup)
        deal(info, 'job_advantage',
             lambda x: x.find('dd', class_='job-advantage').text.strip().split('\n'), soup)
        deal(info, 'job_describe',
             lambda x: x.find('dd', class_='job_bt').text.strip().split('\n'), soup)
        deal(info, 'job_address',
             lambda x: ''.join([i.strip() for i in x.find('dd', class_='job-address clearfix')
                                .find('div', class_='work_addr').text.strip().split('\n')[:-1]]),
             soup)
        deal(info, ['position_lng', 'position_lat'],
             lambda x: [i['value'] for i in x.find('dd', class_='job-address clearfix')
                        .find_all('input')[:2]],
             soup)
        for field in ['hr_name', 'hr_position', 'hr_portrait', 'target_hr']:
            deal(info, field,
                 lambda x, cls=field: x.find('dd', class_='jd_publisher')
                 .find('input', class_=cls)['value'],
                 soup)
        # This value used to be written to 'job_describe', clobbering the
        # description stored above.  NOTE(review): the span's class is
        # 'time'; 'publish_time' is an assumed name - confirm with consumers.
        deal(info, 'publish_time', lambda x: x.find('span', class_='time').text, soup)

        # A page without the similar-jobs box must not abort the whole parse.
        similar_box = soup.find('div', class_='jobs_similar')
        similar_list = None
        if similar_box is not None:
            similar_list = similar_box.find('ul', class_='similar_list reset')
        jobs_similar = similar_list.find_all('li') if similar_list is not None else []

        save_info('lagou_jobs', 'job_id', info)

        company_info = {}
        company = soup.find('dl', class_='job_company')
        deal(company_info, 'company_href', lambda x: x.find('a')['href'], company)
        deal(company_info, 'company_logo', lambda x: x.find('img')['src'], company)
        deal(company_info, 'company_name', lambda x: x.find('em').text.strip(), company)
        deal(company_info, ['company_industry', 'company_financial', 'company_numbers'],
             lambda x: [i.text for i in x.find('ul', class_='c_feature').find_all('h4')[:3]],
             company)
        deal(company_info, 'company_web',
             lambda x: x.find('ul', class_='c_feature').find('a')['href'], company)
        # NOTE(review): get_start_info keys companies by company_id while this
        # keys them by company_name - confirm which one consumers expect.
        if 'company_name' in company_info:
            save_info('lagou_companies', 'company_name', company_info)

        for job in jobs_similar:
            job_info = {}
            deal(job_info, 'job_id', lambda x: x.find('a')['data-lg-tj-cid'], job)
            deal(job_info, 'job_name', lambda x: x.find('h2').text, job)
            deal(job_info, 'salary', lambda x: x.find('p').text, job)
            deal(job_info, ['company_name', 'job_location'],
                 lambda x: x.find('p', class_='similar_company_name')['title'].split(' '), job)
            # Skip entries whose id could not be read instead of failing the
            # whole page on a KeyError.
            if 'job_id' in job_info:
                save_info('lagou_jobs', 'job_id', job_info)

        return _salary_matches(info['salary'])
    except Exception:
        print(traceback.format_exc())
        return False
def _click_first_link(texts):
    """Click the first link whose text contains one of *texts*.

    Raises ``NoSuchElementException`` when none of the texts matches a link.
    """
    for text in texts:
        matches = driver.find_elements(By.PARTIAL_LINK_TEXT, text)
        if matches:
            matches[0].click()
            return
    raise NoSuchElementException(f'no link containing any of {texts!r}')


def send_resume(url, info):
    """Open a job page and submit the resume when the job passes the filter.

    Args:
        url: address of the job-detail page.
        info: job record handed to ``get_job_info`` (updated in place).

    Uses the module-level ``driver``.  Never raises for ordinary errors: a
    missing apply/confirm link is reported in one line, anything else is
    printed as a traceback.
    """
    driver.get(url)
    time.sleep(1)
    page = driver.page_source
    try:
        if not get_job_info(page, info):
            return
        # The site is crawled with a Simplified Chinese keyword, so try the
        # Simplified link text first and keep the original Traditional
        # spelling as a fallback.
        _click_first_link(("投个简历", "投個簡歷"))
        time.sleep(1)
        _click_first_link(("确认投递", "確認投遞"))
    except NoSuchElementException:
        print(f'no apply link found: {url}')
    except Exception:
        print(traceback.format_exc())
def run(mode, is_login=True):
    """Run the crawler in one of its two modes.

    Args:
        mode: ``'start'`` crawls the search-result pages of every configured
            city; ``'send'`` works through the queued job ids, visiting each
            job page and applying where the filter passes.  Any other value
            does nothing after the optional login.
        is_login: log in first when true.

    Relies on the module-level ``r``, ``base_url``, ``cities`` and
    ``cities_num``.
    """
    if is_login:
        login()
    if mode == 'start':
        for num, city in zip(cities_num, cities):
            # %E6%95%B0%E6%8D%AE is the URL-encoded search keyword "数据".
            url = f"{base_url}/jobs/list_%E6%95%B0%E6%8D%AE/p-city_{num}?px=default#filterBox"
            get_start_page(url, city)
            sleep()
    elif mode == 'send':
        while r.scard('lagou_jobs_set') != 0:
            job_id = r.srandmember("lagou_jobs_set")
            raw = r.hget('lagou_jobs_hash', job_id)
            if raw is None:
                # Queued id without a stored record: json.loads(None) would
                # raise and leave the id in the queue, so drop it and go on.
                r.srem('lagou_jobs_set', job_id)
                continue
            info = json.loads(raw)
            # .get(): records saved from a similar-jobs list may lack fields.
            print(f'------------- 公司:{info.get("company_name")} --- 工作:{info.get("job_name")} ------------------------------')
            send_resume(f"{base_url}/jobs/{info['job_id']}.html", info)
            r.srem('lagou_jobs_set', job_id)
            time.sleep(random.choice([10, 15]))
if __name__ == '__main__':
    # Module-level configuration read by the functions above.
    # NOTE(review): the Redis password and the account credentials are
    # hard-coded; consider loading them from the environment.
    r = redis.Redis(password='123zxc')
    # cities and cities_num are index-aligned: run() zips them and puts the
    # number into the p-city_{num} URL segment.
    cities = ['北京', '上海', '杭州', '天津', '蘇州', '南京', '大連', '瀋陽']
    cities_num = [2, 3, 6, 4, 80, 79, 43, 44]
    USERNAME = 'xxxx'
    PASSWORD = 'xxxx'
    base_url = 'https://www.lagou.com'
    # NOTE(review): passing the driver path positionally is the older
    # Selenium calling convention; recent releases expect a Service object.
    driver = webdriver.Chrome("/mnt/2D97AD940A9AD661/python_project/boss/chromedriver")
    try:
        driver.maximize_window()
        run('send')
    finally:
        # Always close the browser, even when the run fails midway.
        driver.quit()
相關文章
- Python爬蟲精簡步驟1 獲取資料Python爬蟲
- 拉勾網職位資料爬取
- JavaScript爬蟲程式實現自動化爬取tiktok資料教程JavaScript爬蟲
- 如何高效獲取大資料?動態ip代理:用爬蟲!大資料爬蟲
- python爬蟲簡歷專案怎麼寫_爬蟲專案咋寫,爬取什麼樣的資料可以作為專案寫在簡歷上?...Python爬蟲
- Python爬蟲遞迴呼叫爬取動漫美女圖片Python爬蟲遞迴
- python爬蟲獲取天氣網實時資料Python爬蟲
- python爬蟲-1w+套個人簡歷模板爬取Python爬蟲
- API商品資料介面呼叫實戰:爬蟲與資料獲取API爬蟲
- 用Python網路爬蟲獲取Mikan動漫資源Python爬蟲
- 網路爬蟲如何獲取IP進行資料抓取爬蟲
- 最簡單的網路圖片的爬取 --Pyhon網路爬蟲與資訊獲取爬蟲
- Laravel 手動搭建簡單的資料爬蟲Laravel爬蟲
- 獲取爬蟲動態IP的三種方法爬蟲
- 爬蟲爬取資料如何繞開限制?爬蟲
- web自動化簡單使用程式碼,獲取集合資料Web
- 爬蟲實戰:從HTTP請求獲取資料解析社群爬蟲HTTP
- 爬取Arcconf文件教程並自動生成Gitbook文件顯示 - 資料獲Git
- 房產資料爬取、智慧財產權資料爬取、企業工商資料爬取、抖音直播間資料python爬蟲爬取Python爬蟲
- 爬蟲實踐之獲取網易雲評論資料資訊爬蟲
- 歷史股票資料的爬取
- 爬蟲實戰——58同城租房資料爬取爬蟲
- 如何保障爬蟲高效穩定爬取資料?爬蟲
- python爬蟲如何獲取表情包Python爬蟲
- python爬取前程無憂和拉勾資料分析崗位並分析Python
- python 獲取excel資料 自動登陸PythonExcel
- Springboot+JPA下實現簡易爬蟲--爬取豆瓣電視劇資料Spring Boot爬蟲
- 求職簡歷-Python爬蟲工程師求職Python爬蟲工程師
- Golang 爬蟲快速入門 | 獲取 B 站全站的視訊資料Golang爬蟲
- 實時獲取股票資料,免費!——Python爬蟲Sina Stock實戰Python爬蟲
- 遞迴遍歷物件獲取value值遞迴物件
- go語言實現簡單爬蟲獲取頁面圖片Go爬蟲
- Golang+chromedp+goquery 簡單爬取動態資料GolangChrome
- 爬蟲例項-淘寶頁面商品資訊獲取爬蟲
- Python 爬蟲獲取網易雲音樂歌手資訊Python爬蟲
- python爬蟲,獲取中國工程院院士資訊Python爬蟲
- 利用Python爬蟲獲取招聘網站職位資訊Python爬蟲網站
- Python爬蟲框架:scrapy爬取高考派大學資料Python爬蟲框架