import pandas as pd

import numpy as np

import jieba

# Load the scraped postings workbook, using its first column as the index.
df = pd.read_excel(r'E:\python 爬蟲 \ 前程無憂招聘資訊 .xlsx', index_col=0)

# Keep one row per (company, job title) pair.
df = df.drop_duplicates(subset=[' 公司名稱 ', ' 崗位名稱 '])

# Inspection of rows with a missing head-count; the result is discarded.
df[df[' 招聘人數 '].isnull()]

# Remove rows in which every column is empty.
df = df.dropna(how='all')

# Job-title column handling.

# Vectorised lower-casing leaves missing titles as NaN instead of raising
# AttributeError the way a per-element ``x.lower()`` does.
df[' 崗位名稱 '] = df[' 崗位名稱 '].str.lower()

counts = df[' 崗位名稱 '].value_counts()

# Keywords that mark a posting as relevant.
target_job = [' 演算法 ',' 開發 ',' 分析 ',' 工程師 ',' 資料 ',' 運營 ',' 運維 ','it',' 倉庫 ',' 統計 ']

# One boolean mask per keyword. Keywords are matched literally (not as regular
# expressions) and a missing title never matches.
index = [
    df[' 崗位名稱 '].str.contains(i, regex=False, na=False)
    for i in target_job
]

# A row is kept when its title contains at least one keyword.
index = np.array(index).any(axis=0)

# Copy so that the column assignments made further down write to an
# independent frame rather than to a slice of ``df``.
job_info = df[index].copy()

# Canonical job categories, in priority order: the first entry found inside a
# title wins.
# NOTE(review): 'andrio' looks like a typo for 'android' -- confirm against the
# data before changing it.
job_list = [
    ' 資料分析 ', " 資料統計 ", " 資料專員 ", ' 資料探勘 ', ' 演算法 ', ' 大資料 ', ' 開發工程師 ',
    ' 運營 ', ' 軟體工程 ', ' 前端開發 ', ' 深度學習 ', 'ai', ' 資料庫 ', ' 倉庫管理 ', ' 資料產品 ',
    # The entry after 'andrio' previously carried injected advertising text
    # and therefore could never match a title.
    ' 客服 ', 'java', '.net', 'andrio', ' 人工智慧 ', 'c++', ' 資料管理 ', " 測試 ", " 運維 ", " 資料工程師 ",
]

job_list = np.array(job_list)


def Rename(x, job_list=job_list):
    """Return the first entry of ``job_list`` contained in ``x``.

    Parameters
    ----------
    x : str
        Job title to normalise.
    job_list : iterable of str
        Candidate category names, checked in order. Any iterable works
        (list, tuple, NumPy array).

    Returns
    -------
    The first matching category, or ``x`` unchanged when nothing matches or
    when ``x`` is not a string (for example a missing value).
    """
    if not isinstance(x, str):
        return x
    # ``next`` stops at the first hit instead of testing every candidate.
    return next((name for name in job_list if name in x), x)

# Map every title onto its category via Rename, then fold the two alias
# labels below into the " 資料分析 " label.
titles = job_info[' 崗位名稱 '].apply(Rename)
for alias in (" 資料專員 ", " 資料統計 "):
    titles = titles.apply(lambda title: title.replace(alias, " 資料分析 "))
job_info[" 崗位名稱 "] = titles

# Salary column handling.

# Keep only salaries whose last character is a period marker and whose
# third-from-last character is a magnitude marker. ``.str[-1]`` / ``.str[-3]``
# yield single characters, so they must be compared with single-character
# literals; the space-padded three-character literals used before could never
# match and emptied the frame.
index1 = job_info[" 崗位薪資 "].str[-1].isin(["年", "月"])

index2 = job_info[" 崗位薪資 "].str[-3].isin(["萬", "千"])

# Copy so the new columns below are written to an independent frame.
job_info = job_info[index1 & index2].copy()

# Everything before the three-character unit suffix is either a single number
# or a "low-high" range; store the mean of its parts.
job_info[' 平均薪資 '] = job_info[' 崗位薪資 '].astype(str).apply(
    lambda x: np.mean(np.array(x[:-3].split('-'), dtype=float))
)

# Unify the salary unit: remember the three-character unit suffix.
# NOTE(review): con_unit compares this column against its own unit literals;
# confirm they match the three-character suffix stored here.
job_info[' 單位 '] = job_info[' 崗位薪資 '].apply(lambda x: x[-3:])

# Inspection only; the result is discarded.
job_info[' 公司領域 '].value_counts()

def con_unit(x):
    """Convert a row's mean salary to an integer amount per month.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide ``' 單位 '`` (the salary unit) and ``' 平均薪資 '``
        (the mean salary expressed in that unit).

    Returns
    -------
    int
        The salary scaled by 10000 (萬/月), by 1000 (千/月), or divided by 12
        and scaled by 10000 (萬/年), truncated toward zero.

    Raises
    ------
    ValueError
        If the unit is not one of the three recognised values.
    """
    # Compare with all whitespace removed so that both the padded form
    # (" 萬 / 月 ") and the compact form ("萬/月") are recognised.
    unit = "".join(str(x[' 單位 ']).split())
    salary = x[' 平均薪資 ']

    if unit == "萬/月":
        z = salary * 10000
    elif unit == "千/月":
        z = salary * 1000
    elif unit == "萬/年":
        z = salary / 12 * 10000
    else:
        # Previously this fell through and failed with UnboundLocalError.
        raise ValueError(
            "unrecognised salary unit: {!r}".format(x[' 單位 '])
        )

    return int(z)

# Rescale every mean salary with con_unit, then relabel the unit column.
monthly_salary = job_info.apply(con_unit, axis=1)
job_info[' 平均薪資 '] = monthly_salary
job_info[' 單位 '] = ' 元 / 月 '

# Work location: keep the part before the first '-'.
locations = job_info[' 工作地點 ']
job_info[' 工作地點 '] = locations.apply(lambda place: place.partition('-')[0])

# Company field: keep the part before the first '/'.
fields = job_info[' 公司領域 ']
job_info[' 公司領域 '] = fields.apply(lambda field: field.partition('/')[0])

# Head-count: substitute "1" for " 若干 ", trim, then drop the first and last
# characters.
headcounts = job_info[' 招聘人數 ']
job_info[' 招聘人數 '] = headcounts.apply(
    lambda text: text.replace(" 若干 ", "1").strip()[1:-1]
)

# Work experience: substitute "1 年以下 " for " 無需 ", trim, then drop the last
# two characters.
experience = job_info[' 工作經驗 ']
job_info[' 工作經驗 '] = experience.apply(
    lambda text: text.replace(" 無需 ", "1 年以下 ").strip()[:-2]
)

# Education requirement: keep the first whitespace-separated token.
education = job_info[' 學歷需求 ']
job_info[' 學歷需求 '] = education.apply(lambda text: text.split()[0])

# Company size: inspection only; the result is discarded.
job_info[' 公司規模 '].value_counts()

# Company-size labels (all whitespace removed) -> compact range strings.
_COMPANY_SIZE_LABELS = {
    '少於50人': "<50",
    '50-150人': "50-150",
    '150-500人': '150-500',
    '500-1000人': '500-1000',
    '1000-5000人': '1000-5000',
    '5000-10000人': '5000-10000',
    '10000人以上': ">10000",
}


def func(x):
    """Translate a company-size label into a compact range string.

    Parameters
    ----------
    x : str
        Company-size label such as ``' 少於 50 人 '``. Whitespace anywhere in
        the label is ignored, so padded and compact spellings both match.

    Returns
    -------
    str or float
        The matching range string (``"<50"``, ``"50-150"``, ...,
        ``">10000"``), or ``np.nan`` when the label is unknown or ``x`` is
        not a string.
    """
    if not isinstance(x, str):
        return np.nan
    return _COMPANY_SIZE_LABELS.get("".join(x.split()), np.nan)

# Company size: translate each label with func.
size_labels = job_info[' 公司規模 ']
job_info[' 公司規模 '] = size_labels.apply(func)

# Company benefits: stringify each value and split it on whitespace.
benefits = job_info[' 公司福利 ']
job_info[' 公司福利 '] = benefits.apply(lambda value: str(value).split())

# Job description: keep only the text before the first ' 職能類別 ' marker.
descriptions = job_info[' 職位資訊 ']
job_info[' 職位資訊 '] = descriptions.apply(
    lambda text: text.partition(' 職能類別 ')[0]
)

# Read the stop-word file and split it on whitespace.
with open(r"E:\C++\ 停用詞表 .txt", 'r', encoding='utf8') as f:
    stopword = f.read()

stopword = stopword.split()

# Membership is tested once per token, so use a set instead of scanning the
# list every time.
stopword_set = frozenset(stopword)

# Lower-case and trim each description, segment it with jieba, then drop the
# stop words. Each cell ends up holding a list of tokens.
job_info[' 職位資訊 '] = (
    job_info[' 職位資訊 ']
    .apply(lambda text: jieba.lcut(text.lower().strip()))
    .apply(lambda tokens: [t for t in tokens if t not in stopword_set])
)

# Build a long-format table with one row per (token, company field) pair.
cons = job_info[' 公司領域 '].value_counts()

industries = pd.DataFrame(cons.index, columns=[' 行業領域 '])

# Collect one frame per company field and concatenate once at the end;
# DataFrame.append was removed in pandas 2.0 and copied the whole table on
# every call.
frames = []

for i in industries[' 行業領域 ']:
    word = job_info[' 職位資訊 '][job_info[' 公司領域 '] == i].dropna()

    # Flatten the token lists directly. Round-tripping them through
    # str(list) and split() corrupts tokens that contain quotes or brackets.
    words = [token for tokens in word for token in tokens]

    if words:
        frames.append(pd.DataFrame({' 分詞明細 ': words, ' 行業領域 ': i}))

if frames:
    industry = pd.concat(frames, ignore_index=True)
else:
    industry = pd.DataFrame(columns=[' 分詞明細 ', ' 行業領域 '])

# Drop newline and empty tokens. The escaped form "\\n" is kept in the list
# because the earlier string round-trip used to produce it.
industry = industry[~industry[' 分詞明細 '].isin(["\n", "\\n", ""])]

# Keep only tokens that occur at least 300 times overall. The threshold is
# applied to the Series because the column name of the frame built from
# value_counts() differs between pandas versions.
token_counts = industry[' 分詞明細 '].value_counts()

count = pd.DataFrame(token_counts)

lst = list(token_counts[token_counts >= 300].index)

industry = industry[industry[' 分詞明細 '].isin(lst)]

# Save the token table and the cleaned postings as Excel workbooks.
for frame, destination in (
    (industry, r'E:\python 爬蟲 \ 資料預處理 \ 詞雲 .xlsx'),
    (job_info, r'E:\python 爬蟲 \ 資料預處理 \ 前程無憂 ( 已清洗 ).xlsx'),
):
    frame.to_excel(destination)

# 前程無憂崗位資料爬取+Tableau視覺化分析

# 相關文章