2.ktacgen

妖怪梧桐發表於2024-11-22

# -*- coding: utf-8 -*-
import pandas as pd
import os
import datetime
import copy
#0,1,2, ,3,4,5,6
def pcgen(dz): #0,1,2,3,4,5,6,7
ar_f=pd.read_excel(dz,sheet_name='PC CC Master',usecols='A,O,Q,R,S,T,W,Y', \
names=['A','B','C','D','E','F','G','H'], \
dtype={'A':str,'B':str,'C':str,'D':str,'E':str,'F':str,'G':str,'H':str}).fillna('')
ar=ar_f.values.tolist()
dic={}
for arr in ar:
if len(arr[0])==8 and arr[0][0]=='1':
dic[arr[0]]=[arr[2],arr[4],arr[5],arr[6],arr[7],arr[3]]
return dic

def stfgen(dz):
ar_f=pd.read_excel(dz,dtype=dict.fromkeys(list(pd.read_excel(dz).columns),'str')).fillna('')
ar=ar_f.values.tolist()
dic={}
for i in ar:
dic[i[0]]=i[5]
return dic

def mapgen(dz):
ar_f=pd.read_excel(dz).fillna('')
ar=ar_f.values.tolist()
return ar

def stfggen(dz):
ar_f=pd.read_excel(dz,dtype=dict.fromkeys(list(pd.read_excel(dz).columns),'str')).fillna('')
ar=ar_f.values.tolist()
dic={}
dicm={}
for k in ar:
if '0' in k[1] or '1' in k[1] or '2' in k[1] or '3' in k[1] \
or '4' in k[1] or '5' in k[1] or '6' in k[1] or '7' in k[1] \
or '8' in k[1] or '9' in k[1]:
dic[k[1].strip()]=[k[27],k[2]]
dicm[k[1].strip()]=[k[12],k[18]]
return dic,dicm

def bwread(dz):
out=[]
with open(dz,'r',encoding='UTF-8') as filein:
art=[]
while True:
txt=filein.readline()
if not txt:
break
if txt[:3]==r'<tr':
art=[]
jj=0
if txt[:13]==r'<td class="3Dx"':
if r'x:num=3D' in txt:
a=txt.find(r'"')+1
b=txt.find(r'"',a)
art.append(float(txt[a:b]))
jj+=1
else:
a=txt.find(r'>')+1
b=txt.find(r'</td>',a)
tt=txt[a:b]

tt=tt.replace('4700/', '')
tt=tt.replace('<br>', ' ')
tt=tt.replace('&#32;', ' ')
tt=tt.replace('&lt;','<')
tt=tt.replace('&gt;','>')
tt=tt.replace('&amp','&')
tt=tt.replace('&;','&')

tt=tt.replace(' ',' ')
tt=tt.strip()
art.append(tt)
jj+=1

if r'colspan' in txt:
a=txt.find(r'colspan')+10
b=txt.find(r' ',a+1)
for i in range(int(txt[a:b])-1):
art.append("")
jj+=1
if txt[:5]==r'</tr>':
if len(art)>0:
out.append(art)
return out

def bwfolderread(dz):
arr=[]
for rr,dd,ff in os.walk(dz):
pass
for f in ff:
dz_t=os.path.join(rr,f)
ar_t=bwread(dz_t)
arr+=ar_t
return arr

#################################################################################################
def clmgen(ar,art):
dicd={}
dicn={}

for i in ar:
if len(i)>=5 and i[0]!='' and i[0]!='period':

if i[3] not in dicd.keys():
dicd[i[3]]=[]
dicn[i[3]]=[[],[],0,0]

if i[4] not in dicd[i[3]]:
dicd[i[3]].append(i[4])

dicn[i[3]][0].append(i[4])
dicn[i[3]][1].append(float(i[13]))
dicn[i[3]][2]+=1
dicn[i[3]][3]+=float(i[13])


for k in dicd.keys():
kk=0
ard=dicd[k]
for i in ard:
if i in art:
kk+=1
if kk>1:
# =============================================================================
# print(dicd[k])
# print(dicn[k])
# print('-------')
# =============================================================================
artemp=[0 for i in art]
for i in range(len(dicn[k][0])):
if dicn[k][0][i] in art:
kt=art.index(dicn[k][0][i])
artemp[kt]+=dicn[k][1][i]
ktt=0
for i in range(1,len(artemp)):
if artemp[i]>artemp[ktt]:
ktt=i
for i in range(len(dicd[k])-1,-1,-1):
if dicd[k][i] in art:
if dicd[k][i]!=art[ktt]:
dicd[k].pop(i)
for i in range(len(dicn[k][0])):
if dicn[k][0][i] in art:
dicn[k][0][i]=art[ktt]
# =============================================================================
# print(dicd[k])
# print(dicn[k])
# print('-------------------')
# =============================================================================


# =============================================================================
# for k in dicd.keys():
# if len(dicd[k])>1:
# print(k)
# print(dicd[k])
# print(dicn[k])
# print('--------------------------')
# =============================================================================

return dicd,dicn

def vat_data_process(df,dicd,dicn,dicp,dics,dicstm,dfm,arst,dic_te):
ar=df.values.tolist()
art=df.columns.tolist()

k=art.index('貨物或應稅勞務、服務名稱')
k2=art.index('條形碼')
k3=art.index('發票程式碼')
k4=art.index('CNY')
k5=art.index('Commodity')
arr=[]
dicn_kt={}
for i in ar:
###################################################
if '*' in i[k]:
artt=i[k].split('*')
elif '(' in i[k]:
artt=i[k].split('(')
elif '(' in i[k]:
artt=i[k].split('(')
else:
artt=[i[k]]
dictt={}
arttt=[]

for t in range(len(artt)):
artt[t]=artt[t].strip()
if artt[t]!='' \
and len(artt[t])<=12 \
and '0' not in artt[t] and '1' not in artt[t] and '2' not in artt[t] \
and '3' not in artt[t] and '4' not in artt[t] and '5' not in artt[t] \
and '6' not in artt[t] and '7' not in artt[t] and '8' not in artt[t] \
and '9' not in artt[t] and '[' not in artt[t] and '【' not in artt[t]:
if artt[t] not in dictt.keys():
dictt[artt[t]]=t
arttt.append(artt[t])

if arttt==[]:
t=i[k]
if '房' in t and '租' in t:
arttt.append('經營租賃')
arttt.append('房租')
elif '租金' in t:
arttt.append('經營租賃')
arttt.append('房租')
elif '租車' in t:
arttt.append('經營租賃')
arttt.append('租車')
elif 'A3' in t or 'A4' in t:
arttt.append('紙製品')
elif '筆' in t:
arttt.append('文具')
else:
arttt=[i[k]]

arttt.append('')
arttt.append('')

if '房' in arttt[0] and '租' in arttt[0]:
arttt[0]='經營租賃'
arttt[1]='房租'


###################################################
bn=i[k2][8:18]
if bn not in dicn_kt.keys():
dicn_kt[bn]=[[],[],0,0]
dicn_kt[bn][1].append(i[k4])
dicn_kt[bn][2]+=1
dicn_kt[bn][3]+=i[k4]


sn=i[k2][:8]
fr=i[k3][:4]
if sn in dicstm.keys():
sg=dicstm[sn][0]
pc='1'+dicstm[sn][1][1:]
else:
sg=''
pc=''

if pc in dicp.keys():
pc_s=dicp[pc][2]
pc_r=dicp[pc][4]
else:
pc_s=''
pc_r=''

arr.append(i)
arr[-1].append(arttt[0])
arr[-1].append(arttt[1])
arr[-1].append(arttt[2])
arr[-1].append(sn)
arr[-1].append(fr)
arr[-1].append(sg)
arr[-1].append(pc)
arr[-1].append(pc_s)
arr[-1].append(pc_r)

art+=['type1','type2','type3','staff code','發票程式碼region','Grade','PC','Service','Region']
#print(dicp)
#print(dics)

#print(dicstm)
df=pd.DataFrame(arr,columns=art)
df=model_use(df,dfm,1)

arr=df.values.tolist()
art=df.columns.tolist()

#print(dicn)
#print(dicn_kt)

for i in arr:
bn=i[k2][8:18]
i.append('')
i.append('')
if bn in dicd.keys():
if len(dicd[bn])==1:
i[-2]='1v1'
i[-1]=dicd[bn][0]
if i[-1]=='':
if bn in dicn.keys():
if dicn[bn][2]==dicn_kt[bn][2] and dicn[bn][3]==dicn_kt[bn][3]:
if i[k4] in dicn[bn][1]:
i[-2]='amount matched'
i[-1]=dicn[bn][0][dicn[bn][1].index(i[k4])]
if i[-1]=='':
if i[-3]!='':
i[-2]='model generate'
i[-1]=i[-3]
if i[-3] in arst:
if bn in dicd.keys():
for k in dicd[bn]:
if k in arst:
if k!=i[-1]:
i[-2]='special type'
i[-1]=k
if i[-1]!='':
i[k5]=dic_te[i[-1]]

art+=['claim type source','claim type']
df=pd.DataFrame(arr,columns=art)

return df

###############################################################################################
def modelread(dz):
df=pd.read_excel(dz,dtype=dict.fromkeys(list(pd.read_excel(dz).columns),'str')).fillna('')
ncn=df.columns.tolist()[-2:]
for k in df.columns.tolist():
if k in ncn:
df[k]=df[k].replace('','0')
df[k]=df[k].astype(float)
return df

def model_use(dfd,dfm,optim): #optim 1 compress model 0 not compress
art=dfd.columns.tolist()
ard=dfd.values.tolist()

arm=dfm.values.tolist()

for i in ard:
i.append('')
for m in arm:
if typefind(i,m,art)==True:
i[-1]=m[-3]
break
if optim==1:
while True:
arm=model_compress(arm)
for i in ard:
if i[-1]=='':
for m in arm:
if typefind(i,m,art)==True:
i[-1]=m[-3]
break
if len(arm[0])==5:
break

art.append('guess_type')
df=pd.DataFrame(ard,columns=art)
return df

def typefind(ard,arm,art):
for m in range(len(arm)-3):
if m % 2 == 0 :
if arm[m]!='':
try:
j=art.index(arm[m])
except:
return False
else:
break
elif m % 2 ==1:
k=arm[m]
if len(k)>=2:
if k[:2]=='>=' or \
(k[0]=='<' and k[1] in '-1234567890'):
ar_temp=k.split(' and ')
if len(ar_temp)==2:
kk1=float(ar_temp[0][2:])
kk2=float(ar_temp[1][1:])
if kk1>=ard[j] or kk2<ard[j]:
return False
elif len(ar_temp)==1:
if ar_temp[0][:2]=='>=':
kk=float(ar_temp[0][2:])
if kk>=ard[j]:
return False
elif ar_temp[0][:1]=='<':
kk=float(ar_temp[0][1:])
if kk<ard[j]:
return False
else:
if k!=ard[j]:
return False
else:
if k!=ard[j]:
return False

return True


def model_compress(arm):
dic={}
t=len(arm[0])-3
for i in arm:
if tuple(i[:t-2]+[i[-3]]) not in dic.keys():
dic[tuple(i[:t-2]+[i[-3]])]=0
dic[tuple(i[:t-2]+[i[-3]])]+=i[-1]

dic2={}
for k in dic.keys():
if k[:t-2] not in dic2.keys():
dic2[k[:t-2]]=['',0,0]
if dic[k] > dic2[k[:t-2]][2]:
dic2[k[:t-2]]=[k[-1],0,dic[k]]

ar=[]
for k in dic2.keys():
ar.append(list(k)+dic2[k])
return ar

#################################################################################################


def datetrim(dt):
dd=dt.strip()
return dd[:10]

def vatairgen(dz,dicp,dics,arm,dicd,dicn,dicstm,dfm,arst,dic_te):
ar_f1=pd.read_excel(dz,sheet_name='vat', \
dtype=dict.fromkeys(list(pd.read_excel(dz,sheet_name='vat').columns),'str')).fillna('')
ar_f1=ar_f1[['發票唯一代號','條形碼','發票程式碼','發票號碼','開票日期','合計不含稅價款(元)',
'合計稅額(元)','價稅合計金額(元)','銷售方納稅人識別號','銷售方開戶行及賬號',
'銷售方名稱','銷售方地址、電話','購買方納稅人識別號','購買方名稱','發票型別',
'是否重票錄入','操作日期','貨物或應稅勞務、服務名稱','規格型號','單位',
'數量','單價(元)','稅率(%)','調整後不含稅價款(元)','調整後稅款(元)']]
ar_f1.columns=['發票唯一代號','條形碼','發票程式碼','發票號碼','開票日期','合計不含稅價款(元)',
'合計稅額(元)','價稅合計金額(元)','銷售方納稅人識別號','銷售方開戶行及賬號',
'Vendor name','銷售方地址、電話','購買方納稅人識別號','購買方名稱','發票型別',
'是否重票錄入','Posting Date','貨物或應稅勞務、服務名稱','規格型號','單位',
'數量','單價(元)','稅率(%)','不含稅價款(元)','稅額(元)']
art_f1=ar_f1.columns.tolist()
ar_f1['Source']='VAT'
ar_f1['Commodity']='' #ar_f1['貨物或應稅勞務、服務名稱'].apply(catamatch,args=(arm,))
ar_f1['不含稅價款(元)']=ar_f1['不含稅價款(元)'].astype('float')
ar_f1['稅額(元)']=ar_f1['稅額(元)'].astype('float')
ar_f1['CNY']=ar_f1['不含稅價款(元)']+ar_f1['稅額(元)']
ar_f1['Posting Date']=ar_f1['Posting Date'].apply(datetrim)

#print(ar_f1)

ar_f1=vat_data_process(ar_f1,dicd,dicn,dicp,dics,dicstm,dfm,arst,dic_te)

ar_f1_out=copy.copy(ar_f1)

ar_f1=ar_f1[art_f1+['Source','Commodity','CNY']]

#print(ar_f1)



ar_f2=pd.read_excel(dz,sheet_name='air', \
dtype=dict.fromkeys(list(pd.read_excel(dz,sheet_name='air').columns),'str')).fillna('')
ar_f2=ar_f2[['發票唯一代號','條形碼','電子客票號碼','填開日期','銷售單位代號','填開單位',
'總額','操作日期']]
ar_f2.drop_duplicates(subset=['電子客票號碼'], keep='first',inplace=True)
ar_f2.columns=['發票唯一代號','條形碼','發票號碼','開票日期','銷售方納稅人識別號',
'Vendor name','價稅合計金額(元)','Posting Date']
ar_f2['Source']='Air'
ar_f2['Commodity']='Air'
ar_f2['CNY']=ar_f2['價稅合計金額(元)']
ar_f2['Posting Date']=ar_f2['Posting Date'].apply(datetrim)

df=pd.concat([ar_f1,ar_f2], axis=0).fillna('')
ar=df.values.tolist()
art=df.columns.tolist()
for i in ar:
if i[art.index('Vendor name')]=='':
i[art.index('Vendor name')]='No Vendor'
#i[art.index('Posting Date')]=i[art.index('Posting Date')][:10]


k=i[art.index('條形碼')]
if len(k)!=20 or k.isdigit()==False:
i.append('')
i.append('')
i.append('')
i.append('')
i.append('')
else:
i.append(k[:8])
i.append(k[8:])

if i[-2] in dics.keys():
i.append(dics[i[-2]][-8:])
else:
i.append('')
kk='1' + i[-1][-7:]
if i[-1]!='' and kk in dicp.keys():
i.append(dicp[kk][3])
i.append(dicp[kk][5])
else:
i.append('')
i.append('')

art.append('Staff Num')
art.append('Claim Num')
art.append('Requestor PC')
art.append('Requestor SG')
art.append('Requestor Office')

df=pd.DataFrame(ar,columns=art)
return df,ar_f1_out

def vatairout(dz,ary,dicp,dics,arm,vmdz,dicd,dicn,dicstm,dfm,arst,dic_te,dzvat): #dzo,
# =============================================================================
# if os.path.exists(dzo)==True:
# os.remove(dzo)
# =============================================================================
dic_out={}

k=0
for yyy in ary:
dz_y=os.path.join(dz,yyy)
ff=os.listdir(dz_y)
for f in ff:
dz_t=os.path.join(dz_y,f)
df,df_vat=vatairgen(dz_t,dicp,dics,arm,dicd,dicn,dicstm,dfm,arst,dic_te)

df=vat_data_tag(df,vmdz)

#df.to_excel('test.xlsx',index=False)
#dzoo=os.path.join(dzo,'vatair' + str(k) + ' ' + f[:f.rindex('.')] + '.xlsx') ################################
dzov=os.path.join(dzvat,'vat' + str(k) + ' ' + f[:f.rindex('.')] + '.xlsx')
#dftotxt(df,dzoo,k)
#df.to_excel(dzoo,index=False) ########################################
df_vat.to_excel(dzov,index=False)
dic_out[str(k) + ' ' + f[:f.rindex('.')] + '.xlsx']=df
#print('vatair',f,'Outputed') ######################################
k+=1
return dic_out


def othergen(dz,dicp,dics):
ar_f1=pd.read_excel(dz,sheet_name='quota', \
dtype=dict.fromkeys(list(pd.read_excel(dz,sheet_name='quota').columns),'str')).fillna('')
ar_f1=ar_f1[['批次號','發票唯一代號','條形碼','發票程式碼','發票號碼','金額(元)','錄入時間',
'人工備註','圖片原件','原始檔','操作員','操作日期']]
ar_f1.columns=['批次號','發票唯一代號','條形碼','發票程式碼','發票號碼','金額(元)','錄入時間',
'人工備註','圖片原件','原始檔','操作員','Posting Date']
ar_f1['Source']='Fixed Invoice'
ar_f1['Commodity']='Others'
ar_f1['Posting Date']=ar_f1['Posting Date'].apply(datetrim)

ar_f2=pd.read_excel(dz,sheet_name='train', \
dtype=dict.fromkeys(list(pd.read_excel(dz,sheet_name='train').columns),'str')).fillna('')
ar_f2=ar_f2[['批次號','發票唯一代號','條形碼','發票號碼','金額(元)','乘車日期','乘車時間',
'乘車人姓名','車次','座位級別','始發站','終點站','錄入時間','人工備註',
'圖片原件','原始檔','操作員','操作日期']]
ar_f2.columns=['批次號','發票唯一代號','條形碼','發票號碼','金額(元)','乘車日期','乘車時間',
'乘車人','車次','座位級別','始發站','終點站','錄入時間','人工備註',
'圖片原件','原始檔','操作員','Posting Date']
ar_f2['Source']='Train'
ar_f2['Commodity']='Travel Miscellaneous'
ar_f2['Posting Date']=ar_f2['Posting Date'].apply(datetrim)

ar_f3=pd.read_excel(dz,sheet_name='taxi', \
dtype=dict.fromkeys(list(pd.read_excel(dz,sheet_name='taxi').columns),'str')).fillna('')
ar_f3=ar_f3[['批次號','發票唯一代號','條形碼','發票程式碼','發票號碼','金額(元)','乘車日期',
'里程','上車時間','下車時間','發票所在地','錄入時間', '人工備註','圖片原件',
'原始檔','操作員','操作日期']]
ar_f3.columns=['批次號','發票唯一代號','條形碼','發票程式碼','發票號碼','金額(元)','乘車日期',
'里程','上車時間','下車時間','發票所在地','錄入時間', '人工備註','圖片原件',
'原始檔','操作員','Posting Date']
ar_f3['Source']='Taxi'
ar_f3['Commodity']='Travel Miscellaneous'
ar_f3['Posting Date']=ar_f3['Posting Date'].apply(datetrim)

ar_f4=pd.read_excel(dz,sheet_name='bus', \
dtype=dict.fromkeys(list(pd.read_excel(dz,sheet_name='bus').columns),'str')).fillna('')
ar_f4=ar_f4[['批次號','發票唯一代號','條形碼','發票程式碼','發票號碼','日期','時間','出發車站',
'到達車站','總金額','姓名','錄入時間','人工備註','圖片原件','原始檔','操作員','操作日期']]
ar_f4.columns=['批次號','發票唯一代號','條形碼','發票程式碼','發票號碼','乘車日期','乘車時間','始發站',
'終點站','金額(元)','乘車人','錄入時間','人工備註','圖片原件','原始檔','操作員','Posting Date']
ar_f4['Source']='Car'
ar_f4['Commodity']='Travel Miscellaneous'
ar_f4['Posting Date']=ar_f4['Posting Date'].apply(datetrim)

ar_f5=pd.read_excel(dz,sheet_name='other', \
dtype=dict.fromkeys(list(pd.read_excel(dz,sheet_name='other').columns),'str')).fillna('')
ar_f5=ar_f5[['批次號','發票唯一代號','條形碼','發票程式碼','發票號碼','總金額','錄入時間','人工備註',
'圖片原件','原始檔','操作員','操作日期']]
ar_f5.columns=['批次號','發票唯一代號','條形碼','發票程式碼','發票號碼','金額(元)','錄入時間','人工備註',
'圖片原件','原始檔','操作員','Posting Date']
ar_f5['Source']='Other'
ar_f5['Commodity']='Others'
ar_f5['Posting Date']=ar_f5['Posting Date'].apply(datetrim)

ar_f6=pd.read_excel(dz,sheet_name='uncertain', \
dtype=dict.fromkeys(list(pd.read_excel(dz,sheet_name='uncertain').columns),'str')).fillna('')
ar_f6=ar_f6[['批次號','發票唯一代號','條形碼','發票程式碼','發票號碼','開票日期','不含稅金額','發票型別',
'乘車人','錄入時間','錯誤原因','圖片原件','原始檔','操作員','操作日期']]
ar_f6.columns=['批次號','發票唯一代號','條形碼','發票程式碼','發票號碼','開票日期','金額(元)','發票型別',
'乘車人','錄入時間','人工備註','圖片原件','原始檔','操作員','Posting Date']
ar_f6['Source']='Pending'

dic_m={'增值稅普通發票(紙質票)':'Others',
'其他發票':'Others',
'增值稅專用發票':'Others',
'定額髮票':'Others',
'火車票':'Travel Miscellaneous',
'客運汽車':'Travel Miscellaneous',
'計程車發票':'Travel Miscellaneous',
'航空運輸電子客票行程單':'Air'}
ar_f6['Commodity']=ar_f6['發票型別'].map(dic_m)

ar_f6['Posting Date']=ar_f6['Posting Date'].apply(datetrim)

df=pd.concat([ar_f1,ar_f2,ar_f3,ar_f4,ar_f5,ar_f6], axis=0).fillna('')
ar=df.values.tolist()
art=df.columns.tolist()
for i in ar:

k=i[art.index('條形碼')]
if len(k)!=20 or k.isdigit()==False:
i.append('')
i.append('')
i.append('')
i.append('')
i.append('')
else:
i.append(k[:8])
i.append(k[8:])

if i[-2] in dics.keys():
i.append(dics[i[-2]][-8:])
else:
i.append('')
kk='1' + i[-1][-7:]
if i[-1]!='' and kk in dicp.keys():
i.append(dicp[kk][3])
i.append(dicp[kk][5])
else:
i.append('')
i.append('')

art.append('Staff Num')
art.append('Claim Num')
art.append('Requestor PC')
art.append('Requestor SG')
art.append('Requestor Office')

df=pd.DataFrame(ar,columns=art)
return df

####資料加tag
def vat_data_tag(data,dz):
df=pd.read_excel(dz).fillna('')
ar=df.values.tolist()
art=df.columns.tolist()
dic={}
for j,k in enumerate(art):
dic[k]=''
for r in ar:
if r[j]!='':
dic[k]+=(r[j]+'|')
dic[k]=dic[k][:len(dic[k])-1]

#print(dic)
###########篩選出hotel資料
contn_join = dic['hotel contn']#'住宿|房費|場地|房租|房屋|租房|住房|私房出租|場地|租金|會展服務*場地費|經營租賃'
nocontn_join = dic['hotel nocontn']#'洗衣|其他|電話|雜費|洗滌|迷你吧|餐費|餐飲|列印|通行|有形動產|車|會議|體育|車|浙|花卉|禮服|電腦'
data.loc[(data['貨物或應稅勞務、服務名稱'].astype(str).str.contains(contn_join)) \
&(~data['貨物或應稅勞務、服務名稱'].astype(str).str.contains(nocontn_join)) \
&(data['Source']=='VAT'), \
'發票貨物服務型別'] \
='hotel'
#print(data)
####篩選出餐飲資料
# =============================================================================
# contn = '餐飲|餐費|乳製品|食品|飲品|茶歇|奶茶|零食|堅果|水果|飲料|肉|蔬菜|酒|茶|\
# 豆製品|穀物|蛋製品|調味品|水產|畜禽|澱粉|食用菌|植物油|幹豆|\
# 糖|薯豆|發酵類製品|薯類|盒馬|香料原料|鹽|果類|罐頭|咖啡|星巴克|麥當勞|京東|天貓|超市'
# nocontn_join = '傢俱|酒店|住宿|體育用品|休閒用品|化學用品|諮詢服務|醫療|醫藥|印刷品|電池|塑膠製品|\
# 護理品|家用|工藝品|文具|日用|玻璃器皿'
# =============================================================================
contn_join = dic['meal contn']
nocontn_join = dic['meal nocontn']
data.loc[(data['貨物或應稅勞務、服務名稱'].astype(str).str.contains(contn_join)) \
&(~data['貨物或應稅勞務、服務名稱'].astype(str).str.contains(nocontn_join)) \
&(data['Source']=='VAT'), \
'發票貨物服務型別'] \
='meal'
#print(data)
######讀取交通費
####################從服務名稱篩選出客運服務費
# =============================================================================
# contn = '計程車運輸服務|出租汽車客運服務|網約車交通出行費|運輸服務客運服務|計程車客運服務|\
# 代訂車服務費|約車服務費|客運服務費|\*運輸服務\*交通運輸服務|\
# \*運輸服務\*客運服務|\*資訊科技服務\*用車平臺服務費|用車服務費|網約車|\
# \*運輸服務\*出行服務|\*運輸服務\*運輸服務費'
# nocontn_join = '\*運輸服務\*地鐵客運服務費|\*運輸服務\*定製公交客運服務費'
# =============================================================================
contn_join = dic['car contn']
nocontn_join = dic['car nocontn']
data.loc[(data['貨物或應稅勞務、服務名稱'].astype(str).str.contains(contn_join)) \
&(~data['貨物或應稅勞務、服務名稱'].astype(str).str.contains(nocontn_join)) \
&(data['Source']=='VAT'), \
'發票貨物服務型別'] \
='carhailing'
#print(data)
####加油費停車費
data.loc[(data['貨物或應稅勞務、服務名稱'].str.contains(dic['park contn'])) \
&(data['Source']=='VAT') ,\
'發票貨物服務型別']='停車費' #"車輛停放|停車|臨停|泊車|車位"
data.loc[data['貨物或應稅勞務、服務名稱'].str.contains(dic['gas contn']) \
&(data['Source']=='VAT') ,\
'發票貨物服務型別']='加油費' #"加油|汽油|柴油"
###others
data.loc[(data['Source']=='VAT')&(data['發票貨物服務型別'].isna()), \
'發票貨物服務型別']='others'

data.loc[data['Source']!='VAT','發票貨物服務型別']=''
return data


def otherout(dz,ary,dicp,dics): #dzo,
# =============================================================================
# if os.path.exists(dzo)==True:
# os.remove(dzo)
# =============================================================================

dic_out={}

k=0
for yyy in ary:
dz_y=os.path.join(dz,yyy)
ff=os.listdir(dz_y)
for f in ff:
dz_t=os.path.join(dz_y,f)
df=othergen(dz_t,dicp,dics)

#dzoo=os.path.join(dzo,'other' + str(k) + ' ' + f[:f.rindex('.')] + '.xlsx') ##################################
#df.to_excel('testo.xlsx',index=False)

#dftotxt(df,dzoo,k)
#df.to_excel(dzoo,index=False) ##################################

dic_out[str(k) + ' ' + f[:f.rindex('.')] + '.xlsx']=df


#print('other',f,'Outputed') #############################
k+=1

return dic_out


def dftotxt(df,dz,k):

# =============================================================================
# if k==0:
# ar=df.values.tolist()
# art=[df.columns.tolist()]
# arr=art+ar
# else:
# ar=df.values.tolist()
# arr=ar
# =============================================================================
ar=df.values.tolist()
art=[df.columns.tolist()]
arr=art+ar
with open(dz,'w',encoding='UTF-8') as fo:
for i in arr:
k=''
for j in i:
if k=='':
k=str(j)
else:
k+=('//////'+str(j))
k+='\n'
fo.write(k)

def allcombine(dic_av,dic_o,dicsg,dz_o,dz_v):
arsg=[]
for k in dicsg.keys():
arsg.append([k,dicsg[k][0],dicsg[k][1]])
dfsg=pd.DataFrame(arsg,columns=['Staff Num','Staff Grade','Staff name'])

ar_vendor=pd.read_excel(dz_v).fillna('').values.tolist()


for k in dic_o.keys():
if k in dic_av.keys():
df_av=dic_av[k]
df_o=dic_o[k]
df_o.rename(columns={'金額(元)':'CNY'},inplace=True)
df_o.loc[df_o['Source']=='Train','Vendor name']='中國鐵道網路有限公司'
df_o.loc[df_o['Source']!='Train','Vendor name']='No Vendor'

dfa=pd.concat([df_o,df_av])
dfa['CNY']=dfa['CNY'].astype(float)
df=pd.merge(dfa,dfsg,how='left',on='Staff Num')

dz=os.path.join(dz_o,'all' + k)
df.to_excel(dz,index=False)

ar=df['Vendor name'].values.tolist()
for i in ar:
if [i] not in ar_vendor:
ar_vendor.append([i])

print('all',k,'Outputed')

df=pd.DataFrame(ar_vendor,columns=['Vendor name'])
df.to_excel(dz_v,index=False)


bg_dt = datetime.datetime.now()
print('start-----')
#yys=input(r'Please input YEARS separated by comma:')
yys='202107'
ary=yys.strip().split(',')


root_dz=r'G:\Management team(Shared)\Data Analytical\08. Vendor Payment\supporting\Procurement - sitong\4. procurement Python\Ktec'


ktac_dz=os.path.join(root_dz,r'K-tec combine')

#out_dz_air=os.path.join(root_dz,r'air data')
#out_dz_other=os.path.join(root_dz,r'other data')

out_dz_all=os.path.join(root_dz,r'all')

out_dz_vat=os.path.join(root_dz,r'vat')

map_dz=os.path.join(root_dz,r'4.Ktec mapping for python - final.xlsx')
vat_map_dz=os.path.join(root_dz,r'vat type mapping.xlsx')

vendor_map_dz=os.path.join(root_dz,r'vendor mapping.xlsx')


pc_dz=r'G:\Management team(Shared)\Data Analytical\08. Vendor Payment\supporting\Procurement - sitong\4. procurement Python\Ktec\PC CC master (valued).xlsx'
stf_dz=r'\\cnbjsfsr550\fingroup$\VBA\REFERENCES\staff email\KAR staff list.xlsx'

stfg_dz=r'G:\Management team(Shared)\Data Analytical\08. Vendor Payment\supporting\Procurement - sitong\4. procurement Python\Ktec\2024.10_KPI Staff List to Finance.xlsx'
model_dz=r'G:\Management team(Shared)\Data Analytical\08. Vendor Payment\supporting\Procurement - sitong\4. procurement Python\Ktec\max_model using.xlsx'


dic_te={'01 Per diem':'delete',
'02 Trip - Meal':'Meal',
'03 OT meal (non-taxable)':'Meal',
'04 OT meal (taxable)':'Meal',
'05 ENT-client,target&busi asso':'Meal',
'06 ENT-staff entertainment':'Meal',
'07 Hotel&ac w SpecialVATfapiao':'Lodging',
'08 Hotel & accommodation':'Lodging',
'09 Trip-Air-domestic 差旅費-國內機票':'Air',
"10 Trip-Air-int'l 差旅費-國際機票":'Air',
'14 Trip - Airfares':'Air',
'15 Trip - Non airfares':'Travel Miscellaneous',
'16 Trip - HK taxi':'Travel Miscellaneous',
'17Trip-excl. airfare & HK taxi':'Travel Miscellaneous',
'18Gift-client,target&busi asso':'Marketing',
'19 Gift - staff':'Gift',
'20 Membership fee':'Subscriptions & Membership & Sponsorship',
'21 Course & exam fee':'Compensation & Benefits',
'22 Telephone & communication':'Telecommunication & Audio/Visual Conference & Associated Service',
'23 Ref book & publication subs':'Others',
'24 Postage & courier':'Courier, Express & Parcel',
'25 Medical (Partner&Director)':'Compensation & Benefits',
'26 Medical (Grade A to D)':'Compensation & Benefits',
'27 Medical (Support Grade)':'Compensation & Benefits',
'28 Office supplies':'Office Supplies',
'29 HR miscellaneous':'Compensation & Benefits',
'30 OT meal (Meituan)':'Meal',
'31 Trip-Non airfares (Meituan)':'Travel Miscellaneous',
'98 Cash advance':'delete',
'99 Others':'Others'}


ar_special_type=['02 Trip - Meal','03 OT meal (non-taxable)','04 OT meal (taxable)',
'05 ENT-client,target&busi asso','06 ENT-staff entertainment']
######################################################################
clm_dz=os.path.join(root_dz,r'claimmonitoring for ktec')
ar_clm=bwfolderread(clm_dz)
dic_clm_des,dic_clm_num=clmgen(ar_clm,ar_special_type)
######################################################################


dic_pc=pcgen(pc_dz)
#print(dic_pc)
dic_stf=stfgen(stf_dz)

dic_stfg,dic_stfg_m=stfggen(stfg_dz)

df_model=modelread(model_dz)


ar_map=mapgen(map_dz)

dic_vatair=vatairout(ktac_dz,ary,dic_pc,dic_stf,ar_map,vat_map_dz,dic_clm_des,dic_clm_num,dic_stfg_m,df_model,
ar_special_type,dic_te,out_dz_vat) #out_dz_air,
dic_other=otherout(ktac_dz,ary,dic_pc,dic_stf) #out_dz_other,


allcombine(dic_vatair,dic_other,dic_stfg,out_dz_all,vendor_map_dz)


#df_plan['Date']=pd.to_datetime(df_plan['Date']).dt.date

diff=datetime.datetime.now()-bg_dt
diff_s=diff.days*24*60*60 + diff.seconds
print(diff_s)
print('Finished')