Crawler: automatic resume submission on Lagou + data scraping

Posted by 泛泛之素 on 2020-10-21

Compared with Boss Zhipin, Lagou does not pop up a verification challenge after you have been browsing for a while.
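The script below has two modes: 'start' walks the Lagou search-result pages for a list of cities and stores job and company records in Redis, while 'send' reads job ids back out of Redis, opens each detail page and submits a resume when the salary falls in the target range. It assumes Selenium 3 (it uses the find_element_by_* API), a local Redis instance, and a matching chromedriver; the third-party packages can be installed with pip install selenium beautifulsoup4 lxml redis.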

from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from bs4 import BeautifulSoup
import redis
import json
import random
import traceback
import re


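# Open the Lagou login page, fill in the account credentials and submit, then
# wait up to ten minutes for the search box on the home page to appear
# (leaving time to pass any verification step by hand).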
def login():
    driver.get("https://passport.lagou.com/login/login.html")
    time.sleep(1)
    driver.find_element_by_css_selector('input[type="text"]').send_keys(USERNAME)
    driver.find_element_by_css_selector('input[type="password"]').send_keys(PASSWORD)
    driver.find_element_by_css_selector('input[type="submit"]').click()
    wait = WebDriverWait(driver, 10 * 60)
    wait.until(EC.presence_of_element_located((By.ID, 'search_input')))
    time.sleep(3)


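# Pause for a random 6-11 seconds between page loads to look less like a bot.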
def sleep():
    time.sleep(random.choice(range(6, 12)))


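# Extract one field (or, if `key` is a list, several parallel fields) from a
# BeautifulSoup node via `func` and store the result in `info`. Parsing errors
# are printed and swallowed so a single missing field does not abort the job.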
def deal(info, key, func, soup):
    try:
        if isinstance(key, list):
            for k, v in zip(key, func(soup)):
                info[k] = v
        else:
            info[key] = func(soup)
    except Exception as e:
        exstr = traceback.format_exc()
        print(exstr)


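# Persist a record: the full dict goes into a Redis hash keyed by `data[key]`,
# and the key is added both to a working set (consumed later) and a backup set.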
def save_info(base_name, key, data):
    r.hset(f"{base_name}_hash", data[key], json.dumps(data))
    r.sadd(f"{base_name}_set", data[key])
    r.sadd(f"{base_name}_set_bak", data[key])


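# Parse one search-result page: for every job card, collect the job fields
# (id, salary, location, HR info, tags, ...) and the company fields, then
# store both records in Redis.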
def get_start_info(page, city):
    soup = BeautifulSoup(page, 'lxml')
    jobs = soup.find('div', class_='s_position_list').find('ul', class_='item_con_list').findAll('li')
    for job in jobs:
        try:
            job_info, company_info = {'job_city': city}, {}
            for k, v in (('job_id', 'data-positionid'), ('salary', 'data-salary'), ('company_name', 'data-company'),
                         ('job_name', 'data-positionname'), ('company_id', 'data-companyid'), ('hr_id', 'data-hrid')):
                deal(job_info, k, lambda x: x[v], job)

            deal(job_info, 'job_location', lambda x: x.find('span', class_='add').text, job)
            deal(job_info, 'update_date', lambda x: x.find('span', class_='format-time').text, job)
            for key in ['hr_name', 'hr_position', 'hr_portrait']:
                deal(job_info, key, lambda x: x.find('input', class_=key)['value'], job)
            deal(job_info, ['experience', 'education'],
                 lambda x: x.find('div', class_='li_b_l').text.strip().split('\n')[1].split('/'), job)
            deal(job_info, 'job_tags',
                 lambda x: [i for i in x.find('div', class_='list_item_bot').text.strip().split('\n') if i != ''], job)

            company = job.find('div', class_='company')
            deal(company_info, ['company_industry', 'company_financial', 'company_numbers'],
                 lambda x: x.find('div', class_='industry').text.strip().split('/'), company)
            deal(company_info, 'company_href', lambda x: x.find('a')['href'], company)
            deal(company_info, 'company_logo', lambda x: x.find('div', class_='com_logo').find('img')['src'], job)
            for k in ['company_id', 'company_name']:
                if k in job_info:
                    company_info[k] = job_info[k]
            save_info('lagou_jobs', 'job_id', job_info)
            save_info('lagou_companies', 'company_id', company_info)
        except Exception as e:
            exstr = traceback.format_exc()
            print(exstr)


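# Open a city's search-result URL and page through it: scrape each page,
# scroll the "next page" button into view and click it, and stop once the
# button can no longer be found (last page reached).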
def get_start_page(url, city):
    driver.get(url)
    try:
        # dismiss the site popup if it appears over the result list
        driver.find_element_by_css_selector('div[class="body-btn"]').click()
    except Exception:
        pass

    index = 1
    while True:
        print(f'---------------------- page:{index}  ------------------------------')
        page = driver.page_source
        get_start_info(page, city)
        try:
            # the trailing space inside class="pager_next " is deliberate: the exact
            # attribute match stops succeeding on the last page, which ends the loop
            target = driver.find_element_by_css_selector('span[class="pager_next "]')
            driver.execute_script("arguments[0].scrollIntoView();", target)
            target.click()
            index += 1
            sleep()
        except Exception as e:
            break


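# Parse a job detail page: enrich the job record (requirements, description,
# address, coordinates, HR info), also store the company and the "similar jobs"
# list, and return True only when the salary range matches the target filter
# (lower bound >= 12K, upper bound 17-40K).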
def get_job_info(page, info):
    with open('job.html', 'w') as f:  # dump the raw page, handy when debugging selectors
        f.write(page)
    soup = BeautifulSoup(page, 'lxml')
    try:
        deal(info, ['city', 'experience', 'education'], lambda x: [re.sub('[/ ]', '', i) for i in x.
             find('dd', class_='job_request').find('h3').text.strip().split('\n')[1:-1]], soup)
        deal(info, 'position_labels', lambda x: x.find('ul', class_='position-label clearfix').text.strip().split('\n'),
             soup)
        deal(info, 'job_advantage', lambda x: x.find('dd', class_='job-advantage').text.strip().split('\n'),
             soup)
        deal(info, 'job_describe', lambda x: x.find('dd', class_='job_bt').text.strip().split('\n'),
             soup)
        deal(info, 'job_address', lambda x: ''.join([i.strip() for i in x.find('dd', class_='job-address clearfix').
                                                    find('div', class_='work_addr').text.strip().split('\n')[:-1]]),
             soup)

        deal(info, ['position_lng', 'position_lat'], lambda x: [i['value'] for i in soup.
             find('dd', class_='job-address clearfix').findAll('input')[:2]], soup)

        for k in ['hr_name', 'hr_position', 'hr_portrait', 'target_hr']:
            deal(info, k, lambda x: x.find('dd', class_='jd_publisher').find('input', class_=k)['value'], soup)
        deal(info, 'publish_time', lambda x: x.find('span', class_='time').text, soup)  # publish time shown on the page
        jobs_similar = soup.find('div', class_='jobs_similar').find('ul', class_='similar_list reset').findAll('li')
        save_info('lagou_jobs', 'job_id', info)

        company_info = {}
        company = soup.find('dl', class_='job_company')
        deal(company_info, 'company_href', lambda x: x.find('a')['href'], company)
        deal(company_info, 'company_logo', lambda x: x.find('img')['src'], company)
        deal(company_info, 'company_name', lambda x: x.find('em').text.strip(), company)
        deal(company_info, ['company_industry', 'company_financial', 'company_numbers'],
             lambda x: [i.text for i in x.find('ul', class_='c_feature').findAll('h4')[:3]], company)
        deal(company_info, 'company_web', lambda x: x.find('ul', class_='c_feature').find('a')['href'], company)
        save_info('lagou_companies', 'company_name', company_info)

        for job in jobs_similar:
            job_info = {}
            deal(job_info, 'job_id', lambda x: x.find('a')['data-lg-tj-cid'], job)
            deal(job_info, 'job_name', lambda x: x.find('h2').text, job)
            deal(job_info, 'salary', lambda x: x.find('p').text, job)
            deal(job_info, ['company_name', 'job_location'],
                 lambda x: x.find('p', class_='similar_company_name')['title'].split(' '), job)
            save_info('lagou_jobs', 'job_id', job_info)
        # salary strings such as '12k-24k' (possibly with a '·14薪' suffix) are split on
        # the separators below; only jobs with lower bound >= 12K and upper bound 17-40K pass
        salary_range = [i for i in re.split('[-Kk·薪]', info['salary']) if i != '']
        return int(salary_range[0]) >= 12 and 40 > int(salary_range[1]) >= 17
    except Exception as e:
        exstr = traceback.format_exc()
        print(exstr)
        return False


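# Open the job detail page and, if it passes the salary filter, click through
# the "投个简历" / "确认投递" buttons to submit a resume.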
def send_resume(url, info):
    driver.get(url)
    time.sleep(1)
    page = driver.page_source
    try:
        if get_job_info(page, info):
            driver.find_element_by_partial_link_text("投個簡歷").click()
            time.sleep(1)
            driver.find_element_by_partial_link_text("確認投遞").click()
    except Exception as e:
        pass


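# mode='start': crawl the search-result pages for every configured city and
# fill Redis with job/company records.
# mode='send': pick a random job id from the Redis working set, fetch its
# detail page and submit a resume where the salary filter passes, then remove
# the id from the set.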
def run(mode, is_login=True):
    if is_login:
        login()

    if mode == 'start':
        for num, city in zip(cities_num, cities):
            url = f"{base_url}/jobs/list_%E6%95%B0%E6%8D%AE/p-city_{num}?px=default#filterBox"
            get_start_page(url, city)
            sleep()

    elif mode == 'send':

        while r.scard('lagou_jobs_set') != 0:
            job_id = r.srandmember("lagou_jobs_set")
            info = json.loads(r.hget('lagou_jobs_hash', job_id))
            print(f'------------- company: {info["company_name"]} --- job: {info["job_name"]}  ------------------------------')
            send_resume(f"{base_url}/jobs/{info['job_id']}.html", info)
            r.srem('lagou_jobs_set', job_id)
            time.sleep(random.choice([10, 15]))


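# Configuration: Redis connection, the cities to crawl together with Lagou's
# numeric city ids, the login credentials (placeholders here), and the path
# to the local chromedriver binary.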
if __name__ == '__main__':
    r = redis.Redis(password='123zxc')
    cities = ['北京', '上海', '杭州', '天津', '苏州', '南京', '大连', '沈阳']
    cities_num = [2, 3, 6, 4, 80, 79, 43, 44]
    USERNAME = 'xxxx'
    PASSWORD = 'xxxx'
    base_url = 'https://www.lagou.com'
    driver = webdriver.Chrome("/mnt/2D97AD940A9AD661/python_project/boss/chromedriver")
    driver.maximize_window()
    run('send')
    driver.quit()
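
Once both modes have run, the scraped records sit in the lagou_jobs_hash and lagou_companies_hash hashes. Below is a minimal sketch of reading them back for inspection, assuming the same Redis instance and key layout as save_info above; the pandas step is just one convenient option and is not part of the crawler itself.

import json
import redis
import pandas as pd  # assumption: only used here for tabular inspection

r = redis.Redis(password='123zxc')

# every record was stored as a JSON string inside the hash
jobs = [json.loads(v) for v in r.hvals('lagou_jobs_hash')]
companies = [json.loads(v) for v in r.hvals('lagou_companies_hash')]

print(f'{len(jobs)} jobs, {len(companies)} companies scraped')
df = pd.DataFrame(jobs)  # one row per job record
# filter(items=...) silently skips columns a record may be missing
print(df.filter(items=['job_name', 'company_name', 'salary', 'job_city']).head())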
