Yes, I've finished a simple web + Spark + Hive + Hadoop app with the basic CRUD operations.
The backend code is below.
It uses PySpark, which is much faster than Hadoop's MapReduce.
Also, I'm running it through a remote interpreter.
from flask import Flask, jsonify, request
from pyspark.sql import SparkSession
import os
from flask_cors import CORS
app = Flask(__name__)
CORS(app)
os.environ['HADOOP_CONF_DIR'] = '/export/server/hadoop-3.3.0/etc/hadoop'
os.environ['YARN_CONF_DIR'] = '/export/server/hadoop-3.3.0/etc/hadoop'
# Configure the SparkSession: run on YARN, connect to Hive, and point at the warehouse dir
spark = SparkSession.builder \
    .appName("Flask with PySpark and Hive") \
    .master("yarn") \
    .config("spark.sql.warehouse.dir", "hdfs://10.0.0.129:8020/user/hive/warehouse") \
    .enableHiveSupport() \
    .getOrCreate()
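# Assumption: enableHiveSupport() needs the Hive metastore config (hive-site.xml) visible to
# Spark (e.g. in $SPARK_HOME/conf); if it isn't, something like
# .config("hive.metastore.uris", "thrift://<metastore-host>:9083") would likely be needed as
# well (<metastore-host> is a placeholder, not part of the original setup).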
# Use the `itheima` database
spark.sql("USE itheima")
# Create: insert data into the Hive table
@app.route('/create', methods=['POST'])
def create_data():
    try:
        data = request.json
        if not isinstance(data, dict):
            return jsonify({"error": "Invalid data format. Expected a JSON object."}), 400
        required_fields = ['id', 'name', 'age']
        if not all(field in data for field in required_fields):
            return jsonify({"error": "Missing required fields"}), 400
        # Validate data types
        if not isinstance(data['id'], int) or not isinstance(data['name'], str) or not isinstance(data['age'], int):
            return jsonify({"error": "Invalid data types"}), 400
        # Insert via a SQL statement; escape single quotes in `name` so the statement stays valid
        safe_name = data['name'].replace("'", "\\'")
        insert_query = f"INSERT INTO t_1 (id, name, age) VALUES ({data['id']}, '{safe_name}', {data['age']})"
        spark.sql(insert_query)
        return jsonify({"message": "Data inserted into Hive table 't_1'"}), 200
    except Exception as e:
        app.logger.error(f"Error occurred: {str(e)}")
        return jsonify({"error": str(e)}), 500
# Read: query data from the Hive table
@app.route('/read', methods=['GET'])
def read_data():
    # Query the Hive table
    df = spark.sql("SELECT * FROM t_1")
    # Convert the result to JSON
    data = df.collect()
    result = [{"id": row["id"], "name": row["name"], "age": row["age"]} for row in data]
    return jsonify(result)
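# Note on /read: collect() pulls the whole table back to the driver; for a large table it
# would be safer to cap the query, e.g. spark.sql("SELECT * FROM t_1 LIMIT 100").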
# Update: update rows in the Hive table (implemented by rewriting the table)
@app.route('/update', methods=['POST'])
def update_data():
    data = request.json
    if not data:
        return jsonify({"error": "No data provided"}), 400
    try:
        # Build a one-row DataFrame from the request payload (cast id and age to int)
        new_df = spark.createDataFrame(
            [(int(data['id']), data['name'], int(data['age']))],
            schema=['id', 'name', 'age']
        )
        # Expose the new row as a temporary view
        new_df.createOrReplaceTempView("temp_table")
        # Rewrite the target table: keep every row whose id is not being updated,
        # then append the new version of the row
        spark.sql("""
            INSERT OVERWRITE TABLE t_1
            SELECT id, name, age FROM t_1
            WHERE id NOT IN (SELECT id FROM temp_table)
            UNION ALL
            SELECT id, name, age FROM temp_table
        """)
        return jsonify({"message": "Data updated successfully"}), 200
    except Exception as e:
        app.logger.error(f"Error occurred: {str(e)}")
        return jsonify({"error": str(e)}), 500
# Delete: remove data from the Hive table
@app.route('/delete', methods=['POST'])
def delete_data():
    condition = request.json.get('condition', {})
    # Only an age-based condition is supported here, e.g. delete rows with age < N
    if "age" not in condition:
        return jsonify({"error": "No supported delete condition provided"}), 400
    age_limit = condition["age"]
    try:
        # Keep only the rows that should survive the delete
        df = spark.sql("SELECT * FROM t_1")
        df = df.filter(df["age"] >= age_limit)
        # Rewrite the table via INSERT OVERWRITE (same pattern as /update,
        # since the Hive table can't be updated in place)
        df.createOrReplaceTempView("kept_rows")
        spark.sql("INSERT OVERWRITE TABLE t_1 SELECT id, name, age FROM kept_rows")
        return jsonify({"message": f"Deleted data where age < {age_limit}"})
    except Exception as e:
        app.logger.error(f"Error occurred: {str(e)}")
        return jsonify({"error": str(e)}), 500
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
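For reference, this is roughly how the endpoints can be exercised with Python's `requests` library (a quick sketch; the base URL below is a placeholder for wherever the Flask app actually runs):

import requests

BASE = "http://10.0.0.129:5000"  # placeholder: host/port from app.run above, adjust as needed

# Create a row
print(requests.post(f"{BASE}/create", json={"id": 1, "name": "Tom", "age": 20}).json())
# Read all rows
print(requests.get(f"{BASE}/read").json())
# Update the row with id = 1
print(requests.post(f"{BASE}/update", json={"id": 1, "name": "Tom", "age": 21}).json())
# Delete rows with age < 18
print(requests.post(f"{BASE}/delete", json={"condition": {"age": 18}}).json())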