# -*- coding: utf-8 -*-
import pandas as pd
import random
import datetime
import math
def pstrgen(ar):
    """Return the relative frequency of every distinct value in ``ar``.

    Args:
        ar: Sequence of hashable values.

    Returns:
        A list of ``[value, share]`` pairs in first-seen order, where
        ``share`` is the value's count divided by ``len(ar)``. An empty
        input yields the sentinel ``['NA']``.
    """
    total = len(ar)
    if total == 0:
        return ['NA']
    counts = {}
    for value in ar:
        counts[value] = counts.get(value, 0) + 1
    return [[value, hits / total] for value, hits in counts.items()]
def cstrgen(ar):
    """Return the absolute count of every distinct value in ``ar``.

    Args:
        ar: Sequence of hashable values.

    Returns:
        A list of ``[value, count]`` pairs in first-seen order. An empty
        input yields the sentinel ``['NA']``.
    """
    if len(ar) == 0:
        return ['NA']
    counts = {}
    for value in ar:
        counts[value] = counts.get(value, 0) + 1
    return [[value, hits] for value, hits in counts.items()]
def typecount(ar):
    """Count how many distinct first elements occur among the rows of ``ar``.

    Args:
        ar: Sequence of indexable rows whose element 0 is hashable.

    Returns:
        The number of distinct ``row[0]`` values (0 for an empty input).
    """
    return len({row[0] for row in ar})
def numsplitfind(ar, n):
    """Search for the numeric cut points that minimise the weighted entropy.

    Candidate thresholds are spread evenly between the smallest and largest
    numeric value (rounded to 2 decimals). Every single threshold and every
    combination of up to ``min(n, class count)`` distinct thresholds is
    scored: the numeric values are turned into interval labels with
    ``numtostr``, rows are grouped per label with ``arsplit`` and the
    group entropies (``hxcal`` of ``pstrgen``) are summed, weighted by the
    group's share of rows.

    Args:
        ar: List of ``[class_value, number]`` rows.
        n: Maximum number of cut points to combine.

    Returns:
        ``(cuts, entropy)`` where ``cuts`` is the ascending list of chosen
        thresholds and ``entropy`` the weighted entropy they achieve.
        ``([], 99999)`` is returned when there are too few rows
        (``len(ar) <= classes**2 * n``) or when all numbers are equal.
    """
    from itertools import chain, combinations

    tc = typecount(ar)
    nn = min(n, tc)
    if len(ar) <= tc * tc * n:
        return [], 99999

    numbers = [row[1] for row in ar]
    n_max = max(numbers)
    n_min = min(numbers)
    if n_max == n_min:
        return [], 99999

    # At least 10 candidate thresholds, more for large inputs.
    ttt = max(len(ar) // (tc * tc * n), 10)
    step = (n_max - n_min) / (ttt + 1)
    arp = [round(n_min + step * (i + 1), 2) for i in range(ttt)]

    # Rounding can collapse neighbouring thresholds; a cut list must not
    # contain the same value twice, so work on the distinct values only.
    # ``arp`` is non-decreasing, hence every combination is already sorted
    # and combinations are produced in the same order the exhaustive
    # index enumeration would first encounter them.
    distinct = list(dict.fromkeys(arp))
    candidates = chain(
        ([value] for value in distinct),
        (
            list(combo)
            for size in range(2, nn + 1)
            for combo in combinations(distinct, size)
        ),
    )

    best_cuts = []
    best_hx = None
    for cuts in candidates:
        labelled = [[row[0], numtostr(row[1], cuts)] for row in ar]
        hx_modified = 0
        for _, members, weight in arsplit(labelled):
            hx_modified += hxcal(pstrgen(members)) * weight
        # Strict '<' keeps the first candidate among equally good ones.
        if best_hx is None or hx_modified < best_hx:
            best_cuts, best_hx = cuts, hx_modified
        # Good enough: stop searching once the split is almost pure.
        if hx_modified < 0.1:
            break

    if best_hx is None:
        return [], 99999
    return best_cuts, best_hx
def hxcal(ar):
    """Compute the Shannon entropy (base 2) of a probability table.

    Args:
        ar: List of ``[value, probability]`` pairs as produced by
            ``pstrgen``; the ``['NA']`` sentinel that ``pstrgen`` returns
            for an empty input is also accepted.

    Returns:
        ``-sum(p * log2(p))`` over all pairs. The ``['NA']`` sentinel and an
        empty table give 0; entries whose probability is not positive
        contribute nothing (the limit of ``p * log2(p)`` at 0 is 0).
    """
    # pstrgen([]) yields ['NA'], which has no probabilities to index.
    if list(ar) == ['NA']:
        return 0
    k = 0
    for entry in ar:
        p = entry[1]
        if p <= 0:
            # log2 is undefined here; such an outcome carries no entropy.
            continue
        k -= p * math.log2(p)
    return k
def arsplit(ar):
    """Group rows by their trailing string fields.

    For each row, every element after the first is joined (each followed by
    a comma) into a group key; the row's first element is collected under
    that key.

    Args:
        ar: List of rows ``[head, s1, s2, ...]`` where ``s1..`` are strings.

    Returns:
        A list of ``[key, heads, share]`` entries in first-seen key order,
        where ``heads`` is the list of first elements in the group and
        ``share`` is the group's size divided by ``len(ar)``.
    """
    total = len(ar)
    slot_of = {}
    groups = []
    for row in ar:
        key = ''.join(field + ',' for field in row[1:])
        if key not in slot_of:
            slot_of[key] = len(groups)
            groups.append([key, [], 0])
        bucket = groups[slot_of[key]]
        bucket[1].append(row[0])
        bucket[2] += 1
    for bucket in groups:
        bucket[2] = bucket[2] / total
    return groups
def gainmatrixgen(df, out_column, winners_t, number_columns):
    """Compute the information gain of every candidate column and pick the best.

    Args:
        df: DataFrame holding the target column and the candidate columns.
        out_column: Name of the target (class) column.
        winners_t: Column names already used higher up in the tree; they are
            skipped.
        number_columns: Names of columns to be treated as numeric; their
            gain is derived from ``numsplitfind(..., 5)``.

    Returns:
        ``(gain_matrix, winner)``. ``gain_matrix`` has one entry per
        candidate column: ``[name, gain]`` for categorical columns and
        ``[name, gain, cuts]`` for numeric ones (gain rounded to 4
        decimals). ``winner`` is the entry with the largest gain.
        ``([], ['Finished'])`` is returned when the target entropy is
        already ``<= 0.1``, when no candidate column is left, or when the
        best gain is ``<= 0.1``.
    """
    art = df.columns.tolist()
    hx_base = hxcal(pstrgen(df[out_column].values.tolist()))
    if hx_base <= 0.1:
        return [], ['Finished']

    artt = []
    for column in art:
        if column == out_column or column in winners_t:
            continue
        pairs = df[[out_column, column]].values.tolist()
        if column in number_columns:
            ard, hx_modified = numsplitfind(pairs, 5)
            artt.append([column, round(hx_base - hx_modified, 4), ard])
        else:
            # Entropy of the target within each category, weighted by the
            # category's share of rows.
            hx_modified = 0
            for _, members, weight in arsplit(pairs):
                hx_modified += hxcal(pstrgen(members)) * weight
            artt.append([column, round(hx_base - hx_modified, 4)])

    # First entry with the strictly largest positive gain wins.
    best = None
    best_gain = 0
    for entry in artt:
        if entry[1] > best_gain:
            best_gain = entry[1]
            best = entry

    # No candidate at all, or nothing worth splitting on.
    if best is None or best[1] <= 0.1:
        return [], ['Finished']
    return artt, best
def datasetread(dz, t_rate, ncn, rr):
    """Load an Excel sheet and split its rows into a train and a test set.

    Every column is read as string with missing cells replaced by ``''``;
    columns listed in ``ncn`` are then converted to float (empty cells
    become 0).

    Args:
        dz: Path of the Excel workbook (read with ``pd.read_excel``).
        t_rate: Share of rows for the training set. The smaller of
            ``t_rate`` and ``1 - t_rate`` decides how many rows are drawn.
        ncn: Names of the numeric columns.
        rr: Sampling mode. ``0`` draws rows at random; ``1`` makes the
            training set come from the first rows; ``-1`` makes it come
            from the last rows. Any other value draws nothing.

    Returns:
        ``(df_train, df_test)`` as two DataFrames with the original columns.
    """
    # A scalar dtype applies to every column, so one read is enough.
    df = pd.read_excel(dz, dtype='str').fillna('')
    for column in df.columns.tolist():
        if column in ncn:
            df[column] = df[column].replace('', '0').astype(float)

    tr = 1 - t_rate if t_rate > 0.5 else t_rate
    art = df.columns.tolist()
    arr = df.values.tolist()
    t = len(arr) - 1
    # Number of rows to draw; computed from len - 1 as in the original design.
    k = max(int(round(t * tr, 0)), 0)

    ar = []
    if rr == 0:
        for _ in range(k):
            s = random.randint(0, t)
            ar.append(arr.pop(s))
            t -= 1
    elif rr in (1, -1):
        # The drawn part is the smaller one; whether it comes from the head
        # or the tail depends on which side the training set should keep.
        take_head = (t_rate <= 0.5) if rr == 1 else (t_rate > 0.5)
        if take_head:
            ar = arr[:k]
            arr = arr[k:]
        else:
            cut = len(arr) - k
            # Rows were historically taken one by one from the end, so the
            # drawn part is in reverse row order.
            ar = arr[cut:][::-1]
            arr = arr[:cut]

    if t_rate <= 0.5:
        df_train = pd.DataFrame(ar, columns=art)
        df_test = pd.DataFrame(arr, columns=art)
    else:
        df_train = pd.DataFrame(arr, columns=art)
        df_test = pd.DataFrame(ar, columns=art)
    return df_train, df_test
def numtostr(n, ard):
    """Translate a number into the label of the interval it falls in.

    Args:
        n: The number to classify.
        ard: Ascending list of cut points.

    Returns:
        ``'<a'`` when ``n`` is below the first cut point, ``'>=a and <b'``
        when it lies between two neighbouring cut points, ``'>=z'`` when it
        is at or above the last one. An empty string is returned when
        ``ard`` is empty or no comparison holds.
    """
    last = len(ard) - 1
    for pos, bound in enumerate(ard):
        if n < bound:
            if pos == 0:
                return '<' + str(bound)
            return '>=' + str(ard[pos - 1]) + ' and ' + '<' + str(bound)
        if pos == last and n >= bound:
            return '>=' + str(ard[-1])
    return ''
def df_num_change(df, cn, ard):
    """Replace the numbers of one column by their interval labels.

    Args:
        df: Source DataFrame (not modified).
        cn: Name of the column to convert.
        ard: Cut points handed to ``numtostr``.

    Returns:
        A new DataFrame with the same columns in which every cell of ``cn``
        whose text does not already start with ``<`` or ``>`` has been
        replaced by ``numtostr(cell, ard)``.
    """
    columns = df.columns.tolist()
    col = columns.index(cn)
    rows = df.values.tolist()
    for row in rows:
        lead = str(row[col])[0]
        # Cells that already carry an interval label are left alone.
        if lead not in ('<', '>'):
            row[col] = numtostr(row[col], ard)
    return pd.DataFrame(rows, columns=columns)
def dfsplit(df, arw, lll):
    """Split a DataFrame into one sub-frame per combination of winner values.

    Args:
        df: DataFrame to split.
        arw: List of winner entries; element 0 of each entry is the name of
            a column to group by.
        lll: Layer index; accepted for interface compatibility, not used.

    Returns:
        A dict mapping a tuple of group values (always a tuple, also when
        grouping by a single column) to the matching rows, each with a
        fresh 0-based index.
    """
    by = [winner[0] for winner in arw]
    dicg = {}
    # Iterating the groupby avoids get_group(); depending on the pandas
    # version the key of a one-column grouping is a scalar or a 1-tuple,
    # so it is normalised here.
    for key, group in df.groupby(by):
        if not isinstance(key, tuple):
            key = (key,)
        # reset_index returns a copy, so the caller never holds a view
        # into ``df``.
        dicg[key] = group.reset_index(drop=True)
    return dicg
# =============================================================================
# def winners_go(arw):
# art=[[i] for i in arw[0]]
# t=len(arw)
# for i in range(len(arw)-1,-1,-1):
# if arw[i]==[]:
# t-=1
# if t==1:
# return art
#
# for i in range(1,t):
# arr=[]
# for n1,i1 in enumerate(art):
# for n2,i2 in enumerate(arw[i]):
# if (i1+[i2]) not in arr:
# arr.append(i1+[i2])
# art=[]
# art=[k for k in arr]
# arr=[]
# for i in range(len(art)):
# k=1
# for t in art[i]:
# if 'Finished' in t:
# k=0
# break
# if k==1 and art[i] not in arr:
# arr.append(art[i])
# #if 'Finished' not in art[i][-1]:
# # arr.append(art[i])
# return arr
# =============================================================================
def layers_go(arl, lll):
    """Select the tree paths of depth ``lll`` that can still be extended.

    Args:
        arl: List of paths; each path is a list of winner entries.
        lll: Required path length.

    Returns:
        The paths whose length equals ``lll`` and whose last entry does not
        contain ``'Finished'``, in their original order.
    """
    return [
        path
        for path in arl
        if len(path) == lll and 'Finished' not in path[-1]
    ]
def layer_winer_find(df, layern, out_column, number_columns, first_layer):
    """Grow the decision tree layer by layer and record every node.

    Layer 0 uses ``first_layer`` as the split column when it is given
    (numeric columns are cut with ``numsplitfind``), otherwise the column
    chosen by ``gainmatrixgen``. For every later layer, each unfinished
    path of the previous layer is split with ``dfsplit`` and
    ``gainmatrixgen`` picks the next column for every resulting group.

    Args:
        df: Training DataFrame.
        layern: Number of layers to build.
        out_column: Name of the target column.
        number_columns: Names of the numeric columns.
        first_layer: Column forced as the root split, or ``''``.

    Returns:
        ``(winners, layers, layers_dic, layers_name)`` where ``winners``
        holds, per layer, the winner entries with their slot number
        appended; ``layers`` holds ``[layer, group_values, winner]`` per
        node; ``layers_dic`` maps ``(layer, slot)`` to the node's
        DataFrame; ``layers_name`` holds the winner path of every node.
    """
    layers = []
    layers_dic = {}
    layers_name = []
    layers_name_sl = []
    winners = [[] for _ in range(layern)]

    for depth in range(layern):
        slot = 0
        if depth == 0:
            # Decide the root split.
            if first_layer == '':
                _, winner = gainmatrixgen(df, out_column, [], number_columns)
                if len(winner) == 3:
                    # Numeric winner: replace numbers by interval labels.
                    df = df_num_change(df, winner[0], winner[2])
            elif first_layer in number_columns:
                pairs = df[[out_column, first_layer]].values.tolist()
                cuts, hx_modified = numsplitfind(pairs, 5)
                df = df_num_change(df, first_layer, cuts)
                winner = [first_layer, hx_modified, cuts]
            else:
                winner = [first_layer, 1]

            winners[depth].append(winner + [slot])
            layers.append([depth, ['None'], winner])
            layers_name.append([winner])
            layers_name_sl.append([winner + [slot]])
            layers_dic[(depth, slot)] = df
            continue

        # Extend every path of the previous layer that is not finished.
        for path in layers_go(layers_name_sl, depth):
            # The last entry of the path ends with its parent's slot number.
            parent = layers_dic[(depth - 1, path[-1][-1])]
            groups = dfsplit(parent, path, depth)
            used_columns = [entry[0] for entry in path]
            for key, part in groups.items():
                _, winner = gainmatrixgen(part, out_column, used_columns, number_columns)
                if len(winner) == 3:
                    part = df_num_change(part, winner[0], winner[2])
                winners[depth].append(winner + [slot])
                layers.append([depth, list(key), winner])
                layers_name.append(path + [winner])
                layers_name_sl.append(path + [winner + [slot]])
                layers_dic[(depth, slot)] = part
                slot += 1

    return winners, layers, layers_dic, layers_name
def train_set_out(ar_layer, dic_layer_df, out_dz, layer_number):
    """Write the rows of all leaf nodes to an Excel file.

    A node counts as a leaf when it sits on the last layer
    (``layer_number - 1``) or when its winner is ``['Finished']``.

    Args:
        ar_layer: Node list ``[layer, group_values, winner]``, aligned
            position by position with ``dic_layer_df``.
        dic_layer_df: Dict mapping a node key to that node's DataFrame.
        out_dz: Path of the Excel file to write.
        layer_number: Total number of layers in the tree.
    """
    frames = []
    for pos, frame in enumerate(dic_layer_df.values()):
        node = ar_layer[pos]
        if node[0] == layer_number - 1 or node[-1] == ['Finished']:
            frames.append(frame)
    # pd.concat rejects an empty list, so fall back to an empty frame.
    df = pd.concat(frames) if frames else pd.DataFrame()
    # Honour the caller's path instead of a hard-coded location.
    df.to_excel(out_dz, index=False)
def layer_to_model(arw, arl, arln, dic_layer_df, out_column, layer_number, number_columns):
    """Flatten the tree into a table with one row per (leaf, class) pair.

    Args:
        arw: Winners per layer (not used here).
        arl: Node list ``[layer, group_values, winner]``, aligned position
            by position with ``dic_layer_df`` and ``arln``.
        arln: Winner path of every node.
        dic_layer_df: Dict mapping a node key to that node's DataFrame.
        out_column: Name of the target column.
        layer_number: Total number of layers in the tree.
        number_columns: Names of the numeric columns (not used here).

    Returns:
        A DataFrame with the columns ``layerN name`` / ``layerN value`` for
        every layer followed by ``class``, ``p`` and ``count``.
    """
    # Column names along every node's path.
    name_paths = [[winner[0] for winner in path] for path in arln]

    leaves = []
    for pos, frame in enumerate(dic_layer_df.values()):
        node = arl[pos]
        if node[-1] == ['Finished']:
            # Finished node: the frame itself is the leaf; the trailing
            # 'Finished' marker is not a split column.
            labels = frame[out_column].values.tolist()
            leaves.append([name_paths[pos][:-1], node[1], pstrgen(labels), cstrgen(labels)])
        elif node[0] == layer_number - 1:
            # Last layer: split once more on the full path to get the leaves.
            for key, part in dfsplit(frame, arln[pos], layer_number).items():
                labels = part[out_column].values.tolist()
                leaves.append([name_paths[pos], list(key), pstrgen(labels), cstrgen(labels)])

    width = layer_number * 2 + 3
    rows = []
    for names, values, shares, counts in leaves:
        for idx, share in enumerate(shares):
            row = [''] * width
            for layer in range(layer_number):
                if layer < len(names):
                    row[layer * 2] = names[layer]
                if layer < len(values):
                    row[layer * 2 + 1] = values[layer]
            row[-3] = share[0]
            row[-2] = share[1]
            row[-1] = counts[idx][-1]
            rows.append(row)

    header = []
    for layer in range(layer_number):
        header.append('layer' + str(layer + 1) + ' name')
        header.append('layer' + str(layer + 1) + ' value')
    header.extend(['class', 'p', 'count'])
    return pd.DataFrame(rows, columns=header)
def full_to_max(df):
    """Keep, for every leaf of the full model, only its most frequent class.

    Args:
        df: Full model table whose last three columns are class, share and
            count; all preceding columns identify the leaf.

    Returns:
        A DataFrame with the same columns and one row per leaf, holding the
        class/share/count triple with the highest count (the first one on
        ties). Leaves whose counts are all 0 keep ``['', 0, 0]``.
    """
    header = df.columns.tolist()
    best = {}
    for row in df.values.tolist():
        leaf = tuple(row[:-3])
        current = best.setdefault(leaf, ['', 0, 0])
        if int(row[-1]) > int(current[2]):
            best[leaf] = row[-3:]
    rows = [list(leaf) + tail for leaf, tail in best.items()]
    return pd.DataFrame(rows, columns=header)
def typefind(ard, arm, art):
    """Check whether a data row satisfies one rule of the model.

    A rule is a flat list of ``name, value`` pairs (one per layer) followed
    by three trailing fields (class, share, count). Pairs are checked left
    to right until an empty name is met. A value is either a plain category
    that must equal the row's cell, or an interval label as written by
    ``numtostr``: ``'<b'`` (cell < b), ``'>=a'`` (cell >= a) or
    ``'>=a and <b'`` (a <= cell < b).

    Args:
        ard: The data row (list of cell values).
        arm: The rule row.
        art: Column names of the data row, used to locate each rule column.

    Returns:
        ``True`` when every checked pair matches, ``False`` otherwise or
        when the rule names a column that ``art`` does not contain.
    """
    j = None
    for m in range(len(arm) - 3):
        if m % 2 == 0:
            # Even position: column name; an empty name ends the rule.
            if arm[m] == '':
                break
            try:
                j = art.index(arm[m])
            except ValueError:
                return False
            continue

        # Odd position: the value required for the column found above.
        k = arm[m]
        cell = ard[j]
        is_interval = len(k) >= 2 and (
            k[:2] == '>=' or (k[0] == '<' and k[1] in '-1234567890')
        )
        if not is_interval:
            if k != cell:
                return False
            continue

        parts = k.split(' and ')
        if len(parts) == 2:
            lower = float(parts[0][2:])
            upper = float(parts[1][1:])
            # Lower bound is inclusive, upper bound exclusive.
            if cell < lower or cell >= upper:
                return False
        elif len(parts) == 1:
            if k[:2] == '>=':
                if cell < float(k[2:]):
                    return False
            elif cell >= float(k[1:]):
                return False
    return True
def model_compress(arm):
    """Drop the deepest layer of a model and keep the majority class per rule.

    Args:
        arm: List of rule rows: ``name, value`` pairs per layer followed by
            class, share and count. Must not be empty.

    Returns:
        A list of rows one layer shorter. Counts of rules that share the
        remaining layers and the class are summed; for every remaining
        prefix the class with the largest summed count is kept (the first
        one on ties). The share field of the result is always 0.
    """
    # Number of leading fields left after removing the last name/value pair.
    keep = len(arm[0]) - 3 - 2

    totals = {}
    for row in arm:
        key = tuple(row[:keep] + [row[-3]])
        totals[key] = totals.get(key, 0) + row[-1]

    winners = {}
    for key, total in totals.items():
        prefix = key[:keep]
        if prefix not in winners:
            winners[prefix] = ['', 0, 0]
        if total > winners[prefix][2]:
            winners[prefix] = [key[-1], 0, total]

    return [list(prefix) + tail for prefix, tail in winners.items()]
def model_use(dfd, dfm, optim):  # optim: 1 = also use compressed models, 0 = do not compress
    """Classify every row of ``dfd`` with the rules in ``dfm``.

    Each row gets the class of the first rule it satisfies (see
    ``typefind``). With ``optim == 1`` the model is then repeatedly
    shortened by one layer with ``model_compress`` and rows that are still
    unclassified are tried against the shorter rules, until the rules are
    down to a single layer.

    Args:
        dfd: Data to classify.
        dfm: Model table (layer name/value pairs, class, share, count).
        optim: ``1`` to fall back on compressed models, anything else to
            use the model as is.

    Returns:
        A new DataFrame with the columns of ``dfd`` plus ``guess_type``,
        which holds the predicted class or ``''`` when no rule matched.
    """
    art = dfd.columns.tolist()
    ard = dfd.values.tolist()
    arm = dfm.values.tolist()

    for row in ard:
        row.append('')
        for rule in arm:
            if typefind(row, rule, art):
                row[-1] = rule[-3]
                break

    if optim == 1:
        # A single-layer rule has 5 fields (name, value, class, share,
        # count). Testing the width before compressing keeps a one-layer
        # or empty model from looping forever or raising on arm[0].
        while arm and len(arm[0]) > 5:
            arm = model_compress(arm)
            for row in ard:
                if row[-1] == '':
                    for rule in arm:
                        if typefind(row, rule, art):
                            row[-1] = rule[-3]
                            break

    art.append('guess_type')
    return pd.DataFrame(ard, columns=art)
def model_statistic(df, out_column):
    """Print accuracy, error and missing rates of a classified DataFrame.

    The prediction is read from the last column of ``df`` and compared with
    ``out_column``. A prediction of ``''`` counts as missing. The figures
    in parentheses are the accurate/error rates relative to the classified
    (non-missing) part only.

    Args:
        df: DataFrame whose last column holds the predicted class.
        out_column: Name of the column holding the true class.

    Returns:
        None. Rates that cannot be computed (empty frame, or no row
        classified at all) are reported as 0.0.
    """
    def _percent(part, whole):
        # Guard against division by zero for empty / fully missing input.
        if not whole:
            return 0.0
        return round(part / whole * 100, 2)

    ard = df.values.tolist()
    art = df.columns.tolist()
    total = len(ard)
    s = 0
    s2 = 0
    if total:
        target = art.index(out_column)
        for row in ard:
            if row[-1] == row[target]:
                s += 1
            if row[-1] == '':
                s2 += 1

    ss = _percent(s, total)
    ss2 = _percent(s2, total)
    ss3 = _percent(total - s - s2, total)
    sss = _percent(ss, ss + ss3)
    sss3 = _percent(ss3, ss + ss3)

    print('Classified ' + str(total) + ' items')
    print('Accurate rate: ' + str(ss) + '% (' + str(sss) + '%)')
    print('Error rate: ' + str(ss3) + '% (' + str(sss3) + '%)')
    print('Missing rate: ' + str(ss2) + '%')
    print('-------------------------------')
# Driver script: read the data set, build the layered model on the training
# part, export the full and the "max" model, then print statistics for the
# training set, the test set and the test set with model compression.
bg_dt = datetime.datetime.now()  # start time, used for the run-time report at the end
#-------------------------------------------------------------------------------------
layer_number=3 # maximum number of columns used to determine the final result (number of layers); this sets the depth of the tree
first_layer='' # column to use as the root (first layer); may be empty, in which case it is not set manually and the program picks it. Empty is a fine default
trainset_rate=0.8
rnd=1 #0 random 1 first rows -1 last rows; with trainset_rate=0.8: 0 = random 80%, 1 = first 80% of rows, -1 = last 80% of rows
out_column='claim type'
number_columns=[]
#number_columns=['不含稅價款(元)','稅額(元)'] # every numeric column must be listed here; categorical columns need not be. Dates must be converted to numbers beforehand and listed here.
dz=r'C:\Users\class model\dataset.xlsx' # the first sheet of this workbook is used by default
#dz=r'C:\Users\101 claim type match\ttt.xlsx'
#dz=r'C:\Users\101 claim type match\datasetall.xlsx'
out_dz_full=r'C:\class model\full_model.xlsx' # full model: every possible outcome is listed
out_dz_max=r'C:\class model\max_model.xlsx' # same as above, keeping only the most frequent outcome per rule
#-------------------------------------------------------------------------------------
df_train,df_test=datasetread(dz,trainset_rate,number_columns,rnd)
#print(df_train)
#print(df_test)
# The tree cannot be deeper than the number of non-target columns.
if layer_number>len(df_train.columns.tolist())-1:
    layer_number=len(df_train.columns.tolist())-1
ar_winner,ar_layer,dic_layer_df,ar_layer_name=layer_winer_find(df_train,layer_number,out_column,number_columns,first_layer)
#print(ar_winner)
#print(ar_layer)
#print(ar_layer_name)
#print(len(ar_layer))
#print(len(ar_layer_name))
#print(dic_layer_df.keys())
#print(ar_layer_name)
#print(ar_layer)
#out_dz=r'C:\101 claim type match\testtest.xlsx'
#train_set_out(ar_layer,dic_layer_df,out_dz,layer_number)
df_model_full=layer_to_model(ar_winner,ar_layer,ar_layer_name,dic_layer_df,out_column,layer_number,number_columns)
df_model_full.to_excel(out_dz_full,index=False)
df_model_max=full_to_max(df_model_full)
df_model_max.to_excel(out_dz_max,index=False)
# Evaluate on the training data (no model compression).
df_train=model_use(df_train,df_model_max,0)
print('-------------------------------')
print('Train Data Set')
model_statistic(df_train,out_column)
# Evaluate on the test data without model compression.
df_test_n=model_use(df_test,df_model_max,0)
print('-------------------------------')
print('Test Data Set')
model_statistic(df_test_n,out_column)
# Evaluate on the test data with model compression enabled (optim=1).
df_test_m=model_use(df_test,df_model_max,1)
print('-------------------------------')
print('Test Data Set (Maxmized)')
model_statistic(df_test_m,out_column)
#out_dz=r'C:\101 claim type match\train data.xlsx'
#df_train.to_excel(out_dz,index=False)
#out_dz=r'C:\101 claim type match\test data.xlsx'
#df_test.to_excel(out_dz,index=False)
# Report the elapsed wall-clock time in whole minutes.
diff=datetime.datetime.now()-bg_dt
diff_s=diff.days*24*60*60 + diff.seconds
print('Finished in ' + str(diff_s // 60) + ' minutes')