PySpark Basic Data Operations Collection

ASKED_2019, posted 2020-10-27

1 Basic Operations

Before anything else, create a SparkSession object (the entry point for running Spark code; think of it as the interactive handle):

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('mu').master('local').getOrCreate()

1.1 Creating a DataFrame

df = spark.read.parquet(parquet_file)
df = spark.read.csv(csv_file)
df = spark.read.json(json_file)

df = spark.createDataFrame(RDD, schema)
df = rdd.toDF(*cols)
# Converting to and from a pandas DataFrame:
df = spark.createDataFrame(pandas_df)
pandas_df = df.toPandas()
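
Reads often need extra options; a minimal sketch (the file names below are hypothetical placeholders):

# Treat the first CSV row as a header and let Spark infer column types
df = spark.read.csv('sample.csv', header=True, inferSchema=True)
# Write a DataFrame back out as parquet, overwriting any existing output
df.write.mode('overwrite').parquet('sample_out.parquet')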

Inspect the data:

df = spark.createDataFrame([('1', 'Joe', '70000', '1'), ('2', 'Henry', '80000', None)],
                           ['Id', 'Name', 'Sallary', 'DepartmentId'])
# 1. View the schema
df.printSchema()
# Output
root
 |-- Id: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sallary: string (nullable = true)
 |-- DepartmentId: string (nullable = true)

# 2. Basic summary statistics
df.describe().show()
# Output
+-------+------------------+-----+-----------------+------------+
|summary|                Id| Name|          Sallary|DepartmentId|
+-------+------------------+-----+-----------------+------------+
|  count|                 2|    2|                2|           1|
|   mean|               1.5| null|          75000.0|         1.0|
| stddev|0.7071067811865476| null|7071.067811865475|         NaN|
|    min|                 1|Henry|            70000|           1|
|    max|                 2|  Joe|            80000|           1|
+-------+------------------+-----+-----------------+------------+

# 3. View the columns
df.columns
# Output
['Id', 'Name', 'Sallary', 'DepartmentId']

# 4. Row count
df.count()
# Output
2

# 5. Count non-null records per column
from pyspark.sql.functions import count
df.agg(*[count(c).alias(c) for c in df.columns]).show()
# Output
+---+----+-------+------------+
| Id|Name|Sallary|DepartmentId|
+---+----+-------+------------+
|  2|   2|      2|           1|
+---+----+-------+------------+

1.2 Basic DataFrame Operations

1.2.1 Selecting and Filtering Data

  • Column selection
  • Row filtering
  • Combined filtering
  • Other advanced operations
# Column selection is done with select
df.select('Id').show()
# Output
+---+
| Id|
+---+
|  1|
|  2|
+---+
# Select multiple columns
df.select('Id','Name').show()
# Output
+---+-----+
| Id| Name|
+---+-----+
|  1|  Joe|
|  2|Henry|
+---+-----+

# Select by column position
df.select(df.columns[0:2]).show()
# Output
+---+-----+
| Id| Name|
+---+-----+
|  1|  Joe|
|  2|Henry|
+---+-----+

# Row filtering: use filter, or [] with a condition; the latter matches pandas behavior
df.filter((df['Id']==1)&(df['Name']=='Joe')).show()
df[(df['Id']==1)&(df.Name=='Joe')].show()
# Output
+---+----+-------+------------+
| Id|Name|Sallary|DepartmentId|
+---+----+-------+------------+
|  1| Joe|  70000|           1|
+---+----+-------+------------+

# Advanced: stratified sampling with df.sampleBy(col, fractions, seed=None), which samples according to the categories of a column
df = spark.createDataFrame([('a',1),('a',2),('a',3),('a',1),('b',1),('b',2)],['key', 'val'])
df.sampleBy('key', fractions={'a':0.2,'b':0.6}, seed=123).show()
+---+---+
|key|val|
+---+---+
|  a|  1|
|  b|  1|
|  b|  2|
+---+---+

1.2.2 Adding, Dropping, and Modifying Columns

# 1. Drop a column
df.drop('DepartmentId').show()

# 2. Modify a column with withColumn
## Cast the column first; Sallary is currently a string, so cast it to int
root
 |-- Id: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sallary: string (nullable = true)
 |-- DepartmentId: string (nullable = true)
    
df = df.withColumn('Sallary',df['Sallary'].cast('int'))
df.printSchema()
## Output: the Sallary type has been changed
root
 |-- Id: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sallary: integer (nullable = true)
 |-- DepartmentId: string (nullable = true)
    
## Flag whether Sallary is greater than 1000: 1 if yes, 0 otherwise
### Method 1: produce a Boolean, then cast it
df.withColumn('Sallary',(df['Sallary']>1000).cast('int')).show()
### Method 2: the when function, which is more flexible
from pyspark.sql.functions import when
df = df.withColumn('Sallary',when(df['Sallary']>1000,1).otherwise(0))
df.show()
## Output
+---+-----+-------+------------+
| Id| Name|Sallary|DepartmentId|
+---+-----+-------+------------+
|  1|  Joe|      1|           1|
|  2|Henry|      1|        null|
+---+-----+-------+------------+

## Replace the digits in the str column with --
from pyspark.sql.functions import regexp_replace
df = spark.createDataFrame([('100-200',)], ['str'])
df.show()
df.select(regexp_replace('str', r'(\d+)', '--').alias('d')).show()
+-------+
|    str|
+-------+
|100-200|
+-------+

+-----+
|    d|
+-----+
|-----|
+-----+

# 3. Add a column, again with withColumn
## Add a column whose value is 0 everywhere
from pyspark.sql.functions import lit
df.withColumn('newCol', lit(0)).show()
## Output
+---+-----+-------+------------+------+
| Id| Name|Sallary|DepartmentId|newCol|
+---+-----+-------+------------+------+
|  1|  Joe|  70000|           1|     0|
|  2|Henry|  80000|        null|     0|
+---+-----+-------+------------+------+

## The new column can also be computed from existing columns
from pyspark.sql.functions import lit
df = df.withColumn('newCol', lit(1))
df.withColumn('new',df['Sallary']+df['newCol']).show()
## Output
+---+-----+-------+------------+------+-----+
| Id| Name|Sallary|DepartmentId|newCol|  new|
+---+-----+-------+------------+------+-----+
|  1|  Joe|  70000|           1|     1|70001|
|  2|Henry|  80000|        null|     1|80001|
+---+-----+-------+------------+------+-----+

## Add a column new: when Id is 1, join the two string columns with '_'; here a user-defined function (UDF) builds the new column
from pyspark.sql import functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def cho(x,y):
    return str(x) + '_' + str(y)
cho1 = udf(cho,StringType())

df.withColumn('new',F.when(df['Id']==1,cho1(df.Id,df.Name)).otherwise(df['Id'])).show()

## Output
+---+-----+-------+------------+-----+
| Id| Name|Sallary|DepartmentId|  new|
+---+-----+-------+------------+-----+
|  1|  Joe|  70000|           1|1_Joe|
|  2|Henry|  80000|        null|    2|
+---+-----+-------+------------+-----+

## Column objects also come with some handy built-in methods
df1.withColumn('Initial', df1.LastName.substr(1,1)).show()
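
Besides substr, a few other Column methods are worth knowing; a quick sketch (reusing the df1 authors DataFrame defined in section 1.2.4, as the substr example above already does):

df1.filter(df1.LastName.startswith('H')).show()            # LastName begins with 'H'
df1.filter(df1.FirstName.isin('Jane', 'Emily')).show()     # membership test
df1.select(df1.FirstName, df1.LastName.contains('ard').alias('has_ard')).show()  # substring test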


# 4. Rename columns
data = spark.createDataFrame([("Alberto", 2), ("Dakota", 2)], 
                                  ["Name", "askdaosdka"])
data.show()
data.printSchema()
# Output
+-------+----------+
|   Name|askdaosdka|
+-------+----------+
|Alberto|         2|
| Dakota|         2|
+-------+----------+

root
 |-- Name: string (nullable = true)
 |-- askdaosdka: long (nullable = true)

## Method 1: selectExpr
df = data.selectExpr("Name as name", "askdaosdka as age")
df.show()
df.printSchema()
## Output
+-------+---+
|   name|age|
+-------+---+
|Alberto|  2|
| Dakota|  2|
+-------+---+

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
    
## Method 2: alias
from pyspark.sql.functions import col

data = data.select(col("Name").alias("name"), col("askdaosdka").alias("age"))
data.show()
## Output
+-------+---+
|   name|age|
+-------+---+
|Alberto|  2|
| Dakota|  2|
+-------+---+

## Method 3: when there are many columns, use withColumnRenamed together with reduce
from functools import reduce

oldColumns = data.schema.names
newColumns = ["name", "age"]

df = reduce(lambda data, idx: data.withColumnRenamed(oldColumns[idx], newColumns[idx]), range(len(oldColumns)), data)
df.printSchema()
df.show()
## Output
root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)

+-------+---+
|   name|age|
+-------+---+
|Alberto|  2|
| Dakota|  2|
+-------+---+
## notice that this method allows you to "overwrite" the same column

1.2.3 Sorting

df.sort('Sallary',ascending=False).show()
# Output
+---+-----+-------+------------+------+
| Id| Name|Sallary|DepartmentId|newCol|
+---+-----+-------+------------+------+
|  2|Henry|  80000|        null|     1|
|  1|  Joe|  70000|           1|     1|
+---+-----+-------+------------+------+

# Multi-column sort (color_df below is assumed to be a DataFrame with 'color' and 'length' columns)
color_df.filter(color_df['length']>=4)\
        .sort('length', 'color', ascending=False).show()

# Mixed ascending/descending sort
color_df.sort(color_df.length.desc(), color_df.color.asc()).show()

# orderBy also sorts; take returns a list of Row objects
color_df.orderBy('length','color').take(4)

1.2.4 Deduplication

authors = [['Thomas','Hardy','June 2,1840'],
            ['Thomas','Hardy','June 2,1840'],
            ['Thomas','H',None],
            ['Jane','Austen','16 December 1775'],
            ['Emily',None,None]]

df1 = spark.createDataFrame(authors,schema=["FirstName","LastName","Dob"])
df1.show()
# Output
+---------+--------+----------------+
|FirstName|LastName|             Dob|
+---------+--------+----------------+
|   Thomas|   Hardy|     June 2,1840|
|   Thomas|   Hardy|     June 2,1840|
|   Thomas|       H|            null|
|     Jane|  Austen|16 December 1775|
|    Emily|    null|            null|
+---------+--------+----------------+


# Drop fully duplicated rows
df1.dropDuplicates().show()
# Output
+---------+--------+----------------+
|FirstName|LastName|             Dob|
+---------+--------+----------------+
|     Jane|  Austen|16 December 1775|
|    Emily|    null|            null|
|   Thomas|   Hardy|     June 2,1840|
|   Thomas|       H|            null|
+---------+--------+----------------+

# Deduplicate on a single column: a row is dropped whenever that column's value repeats
df1.dropDuplicates(subset=['FirstName']).show()
# Output
+---------+--------+----------------+
|FirstName|LastName|             Dob|
+---------+--------+----------------+
|    Emily|    null|            null|
|     Jane|  Austen|16 December 1775|
|   Thomas|   Hardy|     June 2,1840|
+---------+--------+----------------+

1.2.5 Detecting and Handling Missing Values

There are two kinds of missing values to check for: NaN in numeric columns, and ordinary None (null).

# Similar to pandas.isnull

from pyspark.sql.functions import isnull, isnan

# 1. Checking for None (null)
df = spark.createDataFrame([(1, None), (None, 2)], ("a", "b"))
df.select(isnull("a").alias("r1"), isnull(df.a).alias("r2")).show()
# Output
+-----+-----+
|   r1|   r2|
+-----+-----+
|false|false|
| true| true|
+-----+-----+

# 2. Checking for NaN
df = spark.createDataFrame([(1.0, float('nan')), (float('nan'), 2.0)], ("a", "b"))
df.select(isnan("a").alias("r1"), isnan(df.a).alias("r2")).show()
# Output
+-----+-----+
|   r1|   r2|
+-----+-----+
|false|false|
| true| true|
+-----+-----+

Use df.na.drop() or df.dropna() to drop rows containing nulls, and df.na.fill() or df.fillna() to replace nulls with specified values.

# 1. Drop rows with missing values
df.na.drop()
df.dropna()
# thresh=2 keeps only rows with at least 2 non-null values (rows with fewer are dropped)
df.na.drop(thresh=2).show()

# 2. Fill missing values
## Replace missing values with the column mean
from pyspark.sql import functions as F  # Spark built-in functions
# collect() brings the result back to the driver as a list of Row objects; [0][0] extracts the value
mean_salary = df.select(F.mean('salary')).collect()[0][0]
clean_data = df.na.fill({'salary':mean_salary})
## The mean can also be computed with numpy
import numpy as np
mean_salary = np.mean(df.select('salary').collect())

## Fill missing values in every column with the same value
df.fillna('unknown').show()

## Fill different columns with different values
df.na.fill({'LastName':'--', 'Dob':'unknown'}).show()

1.2.6 Joining Data

Use join to combine columns from two DataFrames:

df1 = spark.createDataFrame([('a',1),('b',2),('c',3)],['x1','x2'])
+---+---+
| x1| x2|
+---+---+
|  a|  1|
|  b|  2|
|  c|  3|
+---+---+
df2 = spark.createDataFrame([('a','T'),('b','F'),('d','T')],['x1','x3'])

+---+---+
| x1| x3|
+---+---+
|  a|  T|
|  b|  F|
|  d|  T|
+---+---+

## PySpark joins with join
## The default is an inner join
df1.join(df2,on='x1').show()
## Output: note that the join does not preserve the original row order
+---+---+---+
| x1| x2| x3|
+---+---+---+
|  b|  2|  F|
|  a|  1|  T|
+---+---+---+
## Also note: if the two DataFrames share column names beyond the join key, the result keeps duplicate column names
# pandas marks such duplicates with suffixes like _x and _y; Spark does not
# the joined DataFrame simply contains both copies of the column (see the sketch below)

## Other common join types include left_outer and outer
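
One way around the duplicate-column issue is to rename the overlapping column on one side before joining, or to pick one copy explicitly afterwards; a minimal sketch with a deliberately overlapping column (dfa/dfb are hypothetical):

dfa = spark.createDataFrame([('a',1),('b',2)],['x1','x2'])
dfb = spark.createDataFrame([('a',9),('b',8)],['x1','x2'])
# Rename the overlapping column before joining to avoid a duplicate name
dfa.join(dfb.withColumnRenamed('x2','x2_right'), on='x1', how='inner').show()
# Or keep the join as-is and select one copy explicitly afterwards
dfa.join(dfb, on='x1').select('x1', dfa['x2'], dfb['x2'].alias('x2_right')).show()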

Besides join, there are also row-wise combination operations:

df1 = spark.createDataFrame([('a',1),('b',2),('c',3)],['x1','x2'])
df2 = spark.createDataFrame([('a','T'),('b','F'),('d','T')],['x1','x2'])
df1.show();df2.show()

+---+---+
| x1| x2|
+---+---+
|  a|  1|
|  b|  2|
|  c|  3|
+---+---+

+---+---+
| x1| x2|
+---+---+
|  a|  T|
|  b|  F|
|  d|  T|
+---+---+

# union: set union; intersect: intersection; subtract: difference
print('union:')
df1.union(df2).orderBy('x1', ascending=True).show()
# Output
union:
+---+---+
| x1| x2|
+---+---+
|  a|  1|
|  a|  T|
|  b|  2|
|  b|  F|
|  c|  3|
|  d|  T|
+---+---+

print('intersect:')
df1.intersect(df2).orderBy('x1', ascending=True).show()
# Output
intersect:
+---+---+
| x1| x2|
+---+---+
+---+---+

print('subtract:')
df1.union(df2).subtract(df1).orderBy('x1', ascending=True).show()
# Output
subtract:
+---+---+
| x1| x2|
+---+---+
|  a|  T|
|  b|  F|
|  d|  T|
+---+---+

1.2.7 Other Advanced Row and Column Operations

# (1) Creating a pivot table (sketch below)
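
A minimal pivot sketch using groupBy().pivot() (the sales data below is hypothetical): rows become years, columns become products, and the cells hold the summed qty.

sales = spark.createDataFrame(
    [('2020','A',10),('2020','B',20),('2021','A',15),('2021','B',5)],
    ['year','product','qty'])
sales.groupBy('year').pivot('product').sum('qty').show()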

1.2.8 GroupBy

(1) Basic GroupBy operations

# Grouped computation 1: count
df.groupBy('step_id').count().show()

# Grouped computation 2: agg with a dict
unit_qty = df.groupBy(['step_id','equip_id']).agg({'unit_id':'count'})

# Grouped computation 3: applying multiple functions
import pyspark.sql.functions as F
unit_qty = df.groupBy(['step_id','equip_id']).agg(F.countDistinct('unit_id').alias('unit_qty'),
                                                  F.mean('label').alias('ratio'))
unit_qty.show()
+-------+--------+--------+-------------------+
|step_id|equip_id|unit_qty|              ratio|
+-------+--------+--------+-------------------+
|  52000|C2PAS200|       1| 0.1917808219178082|
|  52000|C2PAS100|       1|0.05714285714285714|
|  51106|C2SRP100|       3|0.16039279869067102|
|  51000|C2PIL100|       1|  0.140084388185654|
|  53000|C2ODF100|       1|0.14112554112554113|
+-------+--------+--------+-------------------+

(2) Combining with pandas

Beyond the native operations, PySpark's groupBy can be combined with pandas for more complex computations.

There are two styles. The first uses pandas_udf together with apply() or agg():

from pyspark.sql.functions import pandas_udf,PandasUDFType

df = spark.createDataFrame(
    [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)],
    ("id", "v"))
## GROUPED_MAP&apply
@pandas_udf('id long,v double,c double',PandasUDFType.GROUPED_MAP)
def sum_udf(pdf):
    v = pdf.v
    return pdf.assign(c=v.sum())
df.groupby("id").apply(sum_udf).show()
## Output
+---+----+----+
| id|   v|   c|
+---+----+----+
|  1| 1.0| 3.0|
|  1| 2.0| 3.0|
|  2| 3.0|18.0|
|  2| 5.0|18.0|
|  2|10.0|18.0|
+---+----+----+
## This approach requires writing out the full output schema, which gets verbose when the schema is long

## GROUPED_AGG&agg
@pandas_udf('double',PandasUDFType.GROUPED_AGG)
def mean_udf(v):
    return v.mean()

df.groupby("id").agg(mean_udf(df['v']).alias('c')).show()
## Output
+---+---+
| id|  c|
+---+---+
|  1|1.5|
|  2|6.0|
+---+---+

## Window: like pandas transform, computes a per-group value and adds it as a new column
from pyspark.sql import Window
w = Window \
    .partitionBy('id') \
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
df.withColumn('mean_v', mean_udf(df['v']).over(w)).show()
## Output
# +---+----+------+
# | id|   v|mean_v|
# +---+----+------+
# |  1| 1.0|   1.5|
# |  1| 2.0|   1.5|
# |  2| 3.0|   6.0|
# |  2| 5.0|   6.0|
# |  2|10.0|   6.0|
# +---+----+------+
# reference:https://www.cnblogs.com/wkang/p/10255043.html

The other style defines an ordinary function and applies it with applyInPandas; personally I prefer this syntax:

import pandas as pd
from pyspark.sql.functions import pandas_udf
df = df.withColumn("label",df["label"].cast("double"))
df.printSchema()

def eqp_label(pdf):
    label = pdf.label
    step_id = pdf.step_id
    equip_id = pdf.equip_id
    return pdf.assign(step_id=step_id,step_ng=label.max(),equip_step=equip_id.nunique())

ng_all = df.select(
    'step_id','equip_id','label'
         ).groupBy("step_id").applyInPandas(eqp_label,schema="step_id string,\
                                     equip_id string,\
                                     label double,\
                                     step_ng double,\
                                     equip_step double")

## With the syntax above, the schema cannot drop the original columns; in fact any function that returns a DataFrame can be used with applyInPandas
def choose(pdf):
    equip = pdf.equip_id
    Nm = pdf.groupby(['step_id',equip]).apply(lambda x:x['glass_id'].nunique())
    Nm = pd.DataFrame(Nm).reset_index()
    Nm.columns = ['step_id','equip_id','nn']
    return Nm

dattmp.groupBy('step_id').applyInPandas(choose,schema="step_id string,\
                                             equip_id string,\
                                             nn double").show()
+-------+--------+------+
|step_id|equip_id|    nn|
+-------+--------+------+
|  53000|C2ODF100|1155.0|
|  51106|C2SRP100| 611.0|
|  52000|C2PAS100| 455.0|
|  52000|C2PAS200| 730.0|
|  51000|C2PIL100|1185.0|
+-------+--------+------+
## Note: custom functions can also be nested inside, which allows more complex operations and custom arguments.
def datfilter(self,dat,leaf,label):  # no need to study this function in detail; it is pure pandas
        Nm=dat.groupby(leaf)[self.glass].count()
        Nm=Nm/Nm.sum()
        dat.loc[:,label]=1*(dat[self.label]>self.lev)
        bNm=dat.groupby(leaf)[label].sum()
        bNm=bNm/bNm.sum()
        Nm=Nm[(Nm>=0.1)|(bNm==1)].reset_index()
        retdat=pd.Series([np.nan,np.nan],index=['value','IV'])
        if Nm.shape[0]>1:
            datmp=pd.merge(dat,Nm[[leaf]],on=leaf,how='inner')
            retdat=self.chi2IV(datmp,leaf,label)
        return retdat

@staticmethod    
def choose(self,pdf):
        step = pdf.step_id
        equip = pdf.equip_new
        label = pdf.label
        datfilter = self.datfilter
        result = pdf.groupby(step).apply(datfilter,equip,label)
        result = pd.DataFrame(result).reset_index()
        result.columns = ['step_id','value','IV']
        return result
    
stepdat = datuse.groupBy(self.step).applyInPandas(self.choose,schema="step_id long,\
                                                        p_value float,\
                                                        IV float")

**Note: to use a custom function inside a class, define it as a static method by adding the @staticmethod decorator in front of the function.**
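
A minimal, self-contained sketch of the pattern (the class and column names here are illustrative, not the author's original code):

import pandas as pd

class Analyzer:
    @staticmethod
    def add_group_mean(pdf):
        # Plain pandas: attach each group's mean of v as a new column
        return pdf.assign(mean_v=pdf['v'].mean())

    def run(self, sdf):
        # The static method can be handed straight to applyInPandas
        return sdf.groupBy('id').applyInPandas(
            Analyzer.add_group_mean,
            schema='id long, v double, mean_v double')

sdf = spark.createDataFrame([(1, 1.0), (1, 2.0), (2, 3.0)], ('id', 'v'))
Analyzer().run(sdf).show()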
