import time
from datetime import date, datetime, timedelta

import pandas as pd
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
# Build (or reuse) the SparkSession that drives every later snippet.
# NOTE(review): the tutorial binds it to `sc`, a name conventionally used
# for a SparkContext — kept unchanged because all later snippets use it.
sc = (
    SparkSession.builder
    .appName("PysparkExample")
    .config("spark.sql.shuffle.partitions", "50")
    .config("spark.driver.maxResultSize", "5g")
    .config("spark.sql.execution.arrow.enabled", "true")
    .getOrCreate()
)
你可以從https://www.kaggle.com/cmenca/new-york-times-hardcover-fiction-best-sellers中下載Kaggle資料集。
DataFrame可以通過讀txt,csv,json和parquet檔案格式來建立。在本文的例子中,我們將使用.json格式的檔案,你也可以使用如下列舉的相關讀取函式來尋找並讀取text,csv,parquet檔案格式。
# Creating Spark DataFrames from the supported file formats.
# JSON
dataframe = sc.read.json('dataset/nyt2.json')
# TXT FILES
dataframe_txt = sc.read.text('text_data.txt')
# CSV FILES
dataframe_csv = sc.read.csv('csv_data.csv')
# PARQUET FILES
dataframe_parquet = sc.read.load('parquet_data.parquet')
# Load the NYT best-sellers JSON dataset and preview the first rows.
dataframe = sc.read.json('dataset/nyt2.json')
dataframe.show(10)
# Remove exact duplicate rows, then preview the deduplicated frame.
dataframe_dropdup = dataframe.dropDuplicates()
dataframe_dropdup.show(10)
# Project a single column.
dataframe.select("author").show(10)
# Project several columns at once.
dataframe.select("author", "title", "rank", "price").show(10)
第一個結果表格展示了“author”列的查詢結果,第二個結果表格展示多列查詢。
# Show the title together with a 0/1 flag computed from a condition
# on the title column ("when" comes from pyspark.sql.functions).
odd_hours_flag = when(dataframe.title != 'ODD HOURS', 1).otherwise(0)
dataframe.select("title", odd_hours_flag).show(10)
在第二個例子中,應用“isin”操作而不是“when”,它也可用於定義一些針對行的條件。
# Keep only the rows whose author is one of the listed names.
author_mask = dataframe.author.isin("John Sandford", "Emily Giffin")
dataframe[author_mask].show(5)
5.3、“Like”操作
# Add a boolean column: TRUE when the title contains the word " THE ".
contains_the = dataframe.title.like("% THE %")
dataframe.select("author", "title", contains_the).show(15)
5.4、“startswith”-“endswith”
# Boolean columns testing the title's prefix and suffix.
starts_the = dataframe.title.startswith("THE")
dataframe.select("author", "title", starts_the).show(5)
ends_nt = dataframe.title.endswith("NT")
dataframe.select("author", "title", ends_nt).show(5)
5.5、“substring”操作
# Slices of the author column (substr is 1-based: start position, length).
# FIX: the original aliased every slice as "title", mislabeling data that
# actually comes from the author column.
dataframe.select(dataframe.author.substr(1, 3).alias("author")).show(5)
dataframe.select(dataframe.author.substr(3, 6).alias("author")).show(5)
dataframe.select(dataframe.author.substr(1, 6).alias("author")).show(5)
6、增加,修改和刪除列
# lit() wraps a constant so it can be used as a column expression.
# FIX: the original called F.lit, but this file never binds an `F` alias —
# it star-imports pyspark.sql.functions, which puts `lit` directly in scope.
dataframe = dataframe.withColumn('new_column', lit('This is a new column'))
# NOTE(review): display() is provided by notebook environments such as
# Databricks; in a plain script use dataframe.show() instead.
display(dataframe)
# Rename column 'amazon_product_url' to 'URL'.
dataframe = dataframe.withColumnRenamed('amazon_product_url', 'URL')
# FIX: the original was missing the closing parenthesis on show(5.
dataframe.show(5)
6.3、刪除列
# Drop columns by name.
# FIX: DataFrame.show() returns None, so the original assignments bound
# None to dataframe_remove / dataframe_remove2 instead of the reduced
# DataFrames. Assign first, then show.
dataframe_remove = dataframe.drop("publisher", "published_date")
dataframe_remove.show(5)
dataframe_remove2 = dataframe.drop(dataframe.publisher).drop(dataframe.published_date)
dataframe_remove2.show(5)
# DataFrame inspection cheat sheet.
dataframe.dtypes              # column names and data types
dataframe.show()              # print the contents
dataframe.head()              # first row (first n rows if given an argument)
dataframe.first()             # first row
dataframe.take(5)             # first 5 rows as a list
dataframe.describe().show()   # summary statistics
dataframe.columns             # column names
dataframe.count()             # number of rows
dataframe.distinct().count()  # number of distinct rows
dataframe.explain(4)          # logical and physical query plans
# Group rows by author and count each author's books.
dataframe.groupBy("author").count().show(10)
作者被以出版書籍的數量分組
# Keep only the rows whose title equals 'THE HOST'; preview 5 of them.
dataframe.filter(dataframe["title"] == 'THE HOST').show(5)
標題列經篩選後僅存在有“THE HOST”的內容,並顯示5個結果
# Missing-value handling cheat sheet.
# FIX: the original called dataframe.na.fill() with no arguments (fill
# requires a value) and referenced undefined placeholder objects
# (dataFrame, dataFrameNaFunctions); the equivalent spellings are kept
# as comments instead of broken statements.

# Fill null values with a constant.
dataframe.na.fill(0)
# Equivalent: dataframe.fillna(0)

# Return a new DataFrame with rows containing nulls removed.
dataframe.na.drop()
# Equivalent: dataframe.dropna()

# Return a new DataFrame replacing one value with another.
dataframe.na.replace(5, 15)
# Equivalent: dataframe.replace(5, 15)
# Inspect partition counts after repartitioning.
dataframe.repartition(10).rdd.getNumPartitions()  # spread over 10 partitions
dataframe.coalesce(1).rdd.getNumPartitions()      # collapsed to 1 partition
# Register the DataFrame as a temp view so it can be queried with SQL.
# FIX: registerTempTable has been deprecated since Spark 2.0;
# createOrReplaceTempView is the supported replacement.
dataframe.createOrReplaceTempView("df")
sc.sql("select * from df").show(3)

# Bucket each book into a theme based on keywords in its description,
# then count the books per theme.
themes = sc.sql("""
    select
        CASE WHEN description LIKE '%love%' THEN 'Love_Theme'
             WHEN description LIKE '%hate%' THEN 'Hate_Theme'
             WHEN description LIKE '%happy%' THEN 'Happiness_Theme'
             WHEN description LIKE '%anger%' THEN 'Anger_Theme'
             WHEN description LIKE '%horror%' THEN 'Horror_Theme'
             WHEN description LIKE '%death%' THEN 'Criminal_Theme'
             WHEN description LIKE '%detective%' THEN 'Mystery_Theme'
             ELSE 'Other_Themes'
        END Themes
    from df
""")
themes.groupBy('Themes').count().show()
# Converting the DataFrame into other representations.
rdd_convert = dataframe.rdd   # the underlying RDD of Row objects
dataframe.toJSON().first()    # RDD of JSON strings
dataframe.toPandas()          # pandas DataFrame collected on the driver
不同資料結構的結果
# Persist selected columns in .parquet format (save()'s default format).
(dataframe
    .select("author", "title", "rank", "description")
    .write
    .save("Rankings_Descriptions.parquet"))
# Persist selected columns in .json format.
(dataframe
    .select("author", "title")
    .write
    .save("Authors_Titles.json", format="json"))
原文標題:
原文連結: