菜市場價格分析 python pandas Apriori演算法 資料預處理

Pikathu_zp發表於2019-03-30

1. 安裝python包

  1.  numpy
  2.  pandas

2. 資料預處理

     1. 由於表格的結構比較特殊,我們對蔬菜和肉類分別處理,然後再合併

      2.  對有缺失值的列直接 drop(刪除)

3. 編碼分析


# coding: utf-8

# In[1]:


# coding = utf-8
import numpy as np
import pandas as pd

# --- Vegetable half of the workbook -----------------------------------------
# The `encoding` keyword is not accepted by `read_excel` in current pandas
# releases (it raises TypeError), and it is not needed for Excel files.
data = pd.read_excel("data.xls")
# Discard the meat/egg columns; they are handled separately below.
data.drop(["肉食禽蛋", "批發價格"], axis=1, inplace=True)
data = data.drop_duplicates().dropna()
data = data.dropna(axis=1)
# Remove the rows whose name cell holds the category label instead of a product.
index2drop = data["蔬菜名"] != "蔬菜類"
data = data[index2drop]
# One row per date, one column per vegetable, cell = price.
data = data.pivot(index='日期', columns='蔬菜名', values='價格')

# --- Meat / poultry / egg half of the workbook -------------------------------
data2 = pd.read_excel("data.xls")
data2.drop(["蔬菜名", "價格"], axis=1, inplace=True)
data2 = data2.drop_duplicates().dropna()
index2drop = data2["肉食禽蛋"] != "肉食禽蛋類"
data2 = data2[index2drop]
data2 = data2.pivot(index='日期', columns='肉食禽蛋', values='批發價格')
data2.head(6)

# --- Combine both halves on the date index -----------------------------------
data = pd.merge(data, data2, on="日期")
data.head(10)
data.drop(["凍芋頭", "北瓜"], axis=1, inplace=True)
# Keep only the columns that have at least 10 non-missing values.
data.dropna(thresh=10, axis=1, inplace=True)
data.head()
data.dtypes
# data.to_csv("data1.csv",encoding='utf-8')
#
# # help(pd.concat)
# data=pd.concat([data,data2],keys="日期")
# data.head(100)
# data = pd.merge(data,data2,left_on="日期")


# # 在excel中開啟data1.csv
# ## 將亂碼的缺失值替換成空 (ctrl + F)
# > 若出現讀不進來的情況,把csv檔案的字尾改為txt,用記事本開啟,然後另存為[utf-8編碼](https://jingyan.baidu.com/article/9faa7231ae80c2473c28cb85.html),然後把字尾名改回去

# In[2]:


# Remember the column labels of the merged frame so they can be re-applied to
# the CSV that was cleaned by hand.
columns = data.columns
print(columns)
print(columns.shape)

# Load the hand-cleaned CSV (indexed by date), force every value to float,
# then restore the original column labels.
rawCsv = pd.read_csv("data1.csv", index_col="日期", encoding='utf-8')
data2 = rawCsv.astype('float')
data2.columns = columns
data2.describe()


# In[19]:


# Per-column flag: True when the column contains at least one missing value.
temp = data2.isna().any()
# Keep only the columns without any missing value.
dataDropped = data2.dropna(axis='columns')


# ## Author's note: there are 61 kinds of goods in total, 38 of which are complete.
# ## So the columns containing NaN values are dropped.

# In[20]:


# Labels of the surviving (complete) columns.
dataDroppedNameList = dataDropped.columns


# In[21]:


dataDroppedNameList


# # diff and encode
# 1. Get the difference DataFrame
# 2. Encode increasing items as c*2 and decreasing items as c*2+1, where c is the column number
# 

# In[22]:


# Row-to-row price change for every complete column (first row becomes NaN).
dataDiff = dataDropped.diff()


# In[23]:


dataDiff.head(10)


# In[24]:


# Encode each change as an item id: column c rising -> 2*c, falling -> 2*c + 1.
# Assignment goes through a single `.loc` call; the chained form
# `frame[col][mask] = value` may write to a temporary copy and is a no-op
# when pandas copy-on-write is enabled.
# NOTE(review): for column 0 the "rising" code is 0, the same value used for
# "no change"; zerofilter() later removes every 0, so rises of the first
# column are lost. Confirm whether the codes should be offset.
dataEncode = dataDiff.copy()
for i, columnName in enumerate(dataDroppedNameList):
    change = dataDiff[columnName]
    dataEncode.loc[change > 0, columnName] = float(i * 2)
    dataEncode.loc[change < 0, columnName] = float(i * 2 + 1)


# In[42]:


# Remove the row labelled '2008/9/29' (presumably the first date, whose diff
# is NaN - confirm against the data) and store the codes as integers.
dataEncode = dataEncode.drop(index=['2008/9/29']).astype('int')


# In[33]:


# Transpose the encoded frame and write it to Excel for the Apriori step.
dataEncode = dataEncode.transpose()
dataEncode.to_excel("Encode.xlsx")


# In[43]:


print(dataEncode)


# In[60]:


# list = [1,2,3,0,2,0,0,303]


import numpy as np
import pandas as pd

def zerofilter(listArr):
    """Return the elements of ``listArr`` that are not equal to 0, in order."""
    return [value for value in listArr if value != 0]

def loadDataSet():
    """Load the encoded price changes from ``Encode.xlsx`` as transactions.

    Each column of the sheet becomes one transaction: the list of its cell
    values with every 0 removed (see ``zerofilter``).

    The sheet is written by ``DataFrame.to_excel`` with its index, so the
    first column holds row labels rather than item codes. It is read as the
    index (``index_col=0``); otherwise the labels would be returned as an
    extra, meaningless transaction.

    Returns:
        A list of lists of item codes, e.g.
        ``[[1, 2, 5], [2, 4], [2, 3], [1, 2, 4]]``.
    """
    data = pd.read_excel("Encode.xlsx", index_col=0)
    return [zerofilter(list(data[column])) for column in data.columns]
# 1. Build the candidate 1-itemsets C1
def createC1(dataSet):
    """Return every distinct item of ``dataSet`` as a one-element frozenset.

    Args:
        dataSet: iterable of transactions, each an iterable of items.

    Returns:
        A list of single-item frozensets, ordered by ascending item.
    """
    distinctItems = {item for transaction in dataSet for item in transaction}
    return [frozenset([item]) for item in sorted(distinctItems)]

# Turn the candidate itemsets Ck into the frequent itemsets Lk
def scanD(D, Ck, minSupport):
    """Count the candidates over the data set and keep the frequent ones.

    Args:
        D: the transactions (each an iterable of items).
        Ck: candidate itemsets (frozensets).
        minSupport: minimum support, as a fraction of ``len(D)``.

    Returns:
        ``(Lk, supportData)``: the candidates whose support is at least
        ``minSupport``, and a dict mapping every candidate that occurs in at
        least one transaction to its support.
    """
    # Occurrence count per candidate, in order of first occurrence.
    ssCnt = {}
    for tid in D:
        for can in Ck:
            if can.issubset(tid):
                ssCnt[can] = ssCnt.get(can, 0) + 1

    numItems = float(len(D))
    supportData = {can: count / numItems for can, count in ssCnt.items()}
    Lk = [can for can, support in supportData.items() if support >= minSupport]
    return Lk, supportData

# Join step: combine frequent (k-1)-itemsets into candidate k-itemsets
def aprioriGen(Lk_1, k):
    """Build candidate k-itemsets from the frequent (k-1)-itemsets.

    Two itemsets are merged when their first ``k - 2`` items, taken in sorted
    order, are equal. The items are sorted *before* the prefix is taken;
    slicing the raw iteration order of a frozenset first would make the
    prefix depend on hash order, which can drop valid candidates or yield
    duplicates.

    Args:
        Lk_1: list of frozensets, each of size ``k - 1``.
        k: size of the itemsets to generate.

    Returns:
        A list of candidate frozensets of size ``k``.
    """
    prefixes = [sorted(itemset)[:k - 2] for itemset in Lk_1]
    Ck = []
    for i in range(len(Lk_1)):
        for j in range(i + 1, len(Lk_1)):
            # Merge the two sets when their first k-2 items agree.
            if prefixes[i] == prefixes[j]:
                Ck.append(Lk_1[i] | Lk_1[j])
    return Ck

def apriori(dataSet, minSupport = 0.5):
    """Run the level-wise Apriori search for frequent itemsets.

    Args:
        dataSet: the transactions.
        minSupport: minimum support, as a fraction of the transaction count.

    Returns:
        ``(L, supportData)``: ``L[i]`` is the list of frequent itemsets of
        size ``i + 1`` (the last entry is the empty list that ended the
        search); ``supportData`` maps the counted itemsets to their support.
    """
    singles = createC1(dataSet)
    frequent, supportData = scanD(dataSet, singles, minSupport)
    L = [frequent]
    k = 2
    # Grow the itemsets one item at a time until a level comes back empty.
    while len(L[-1]) > 0:
        candidates = aprioriGen(L[-1], k)
        frequent, levelSupport = scanD(dataSet, candidates, minSupport)
        supportData.update(levelSupport)
        print("lk:", frequent)
        L.append(frequent)
        k += 1

    return L, supportData

def generateRules(L, supportData, minConf=0.7):
    """Derive association rules from the frequent itemsets.

    Args:
        L: list of frequent-itemset levels; ``L[0]`` holds the single-item
            sets and is skipped because a rule needs at least two items.
        supportData: dict mapping itemsets to their support.
        minConf: minimum confidence a rule must reach.

    Returns:
        A list of ``(antecedent, consequent, confidence)`` tuples.
    """
    bigRuleList = []  # every rule that passes the confidence threshold
    for i in range(1, len(L)):
        for freqSet in L[i]:
            # Start from the single-item consequents of this itemset.
            H1 = [frozenset([item]) for item in freqSet]
            # Always evaluate the single-item consequents. Previously sets
            # with three or more items went straight to rulesFromConseq,
            # which merges consequents before testing them, so rules such as
            # {a, b} -> {c} were never produced.
            kept = calcConf(freqSet, H1, supportData, bigRuleList, minConf)
            # Larger consequents can only pass if every single item they
            # contain passed, so only the kept consequents are merged further.
            if i > 1 and kept:
                rulesFromConseq(freqSet, kept, supportData, bigRuleList, minConf)
    return bigRuleList

#生成候選規則集合:計算規則的可信度以及找到滿足最小可信度要求的規則
def getinfo(num):
    """Translate an item code into ``"<goods name>  up"`` or ``"<goods name>  down"``.

    The goods name is ``dataDroppedNameList[int(num / 2)]``; an odd code
    means "down" and an even code means "up".
    """
    good = dataDroppedNameList[int(num / 2)]
    direction = "  down" if num % 2 else "  up"
    return good + direction
def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    """Evaluate the rules ``(freqSet - conseq) -> conseq`` for each ``conseq`` in ``H``.

    Rules whose confidence reaches ``minConf`` are printed and appended to
    ``brl`` as ``(antecedent, consequent, confidence)``.

    Args:
        freqSet: the frequent itemset the rules are built from.
        H: candidate consequents (frozensets, subsets of ``freqSet``).
        supportData: dict mapping itemsets to their support.
        brl: list that receives the accepted rules (mutated in place).
        minConf: minimum confidence.

    Returns:
        The consequents whose rule passed the threshold.
    """
    prunedH = []
    for conseq in H:
        antecedent = freqSet - conseq
        # confidence = support(whole set) / support(antecedent)
        conf = supportData[freqSet] / supportData[antecedent]
        if conf >= minConf:
            # Show every item on both sides; printing only the first item
            # misrepresented rules with multi-item antecedents/consequents.
            cause = ", ".join(getinfo(item) for item in sorted(antecedent))
            result = ", ".join(getinfo(item) for item in sorted(conseq))
            print(cause, "--->>>", result)
            brl.append((antecedent, conseq, conf))
            prunedH.append(conseq)
    return prunedH

#合併
def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    """Recursively build rules with larger consequents for ``freqSet``.

    Args:
        freqSet: the frequent itemset the rules are built from.
        H: consequents of equal size that may appear on the right-hand side.
        supportData: dict mapping itemsets to their support.
        brl: list that receives the accepted rules (mutated in place).
        minConf: minimum confidence.
    """
    m = len(H[0])
    # Stop when a consequent one item larger would leave no antecedent.
    if len(freqSet) <= m + 1:
        return
    # Merge the consequents into candidates of size m + 1 and keep the ones
    # whose rule reaches the confidence threshold.
    merged = aprioriGen(H, m + 1)
    survivors = calcConf(freqSet, merged, supportData, brl, minConf)
    # At least two survivors are needed to merge any further.
    if len(survivors) > 1:
        rulesFromConseq(freqSet, survivors, supportData, brl, minConf)
# Mine the encoded price changes: frequent itemsets first, then the rules.
dataset = loadDataSet()
L, supportData = apriori(
    dataset,
    minSupport=0.05,
)
print("------------------------")
print("----------generateRules-----------")
rules = generateRules(
    L,
    supportData,
    minConf=0.4,
)


# In[50]:


# Small demonstration of the built-in frozenset() type.

# The numbers 1 through 9 as a tuple.
nu = tuple(range(1, 10))

# Build an immutable set from the tuple.
fnum = frozenset(nu)

# Show the frozenset, then its elements as a list.
print("frozenset Object is : ", fnum)
list(fnum)

 

相關文章