Pitfalls I Hit While Parsing Data
import sys
from pyspark.sql.types import StructType, StringType, StructField

# Force the default string encoding to UTF-8 so Chinese text does not raise
# UnicodeDecodeError (Python 2 only; this hack is unnecessary on Python 3).
reload(sys)
sys.setdefaultencoding('utf8')
# Path for spark source folder
import os
# os.environ['SPARK_HOME'] = "/opt/spark-2.0.1-bin-hadoop2.7"
# Append pyspark to Python Path
# sys.path.append("/opt/spark-2.0.1-bin-hadoop2.7/python/")
# sys.path.append("/opt/spark-2.0.1-bin-hadoop2.7/python/lib/")
try:
    from pyspark import SparkContext
    from pyspark import SparkConf
    from pyspark.sql import SparkSession
    from pyspark.sql import SQLContext
    from pyspark.sql import DataFrame
    from pyspark.sql import Row
    print("Successfully imported Spark Modules")
except ImportError as e:
    print("Can not import Spark Modules", e)
    sys.exit(1)
sparkpath = "spark://192.168.31.10:7077"
hdfspath = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/raw/courtnotice.csv'
hdfspath_1 = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/raw/judgedoc_litigant.txt'
hdfspath_2 = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/raw/judgedoc_detail_tmp.csv'
hdfspath_3 ='hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/feature/feature_extract_shixin.csv'
hdfspath_4 ='hdfs://192.168.31.10:9000/hdfs/riskModelNotDaily/mid/saic/share_as_fr.csv'
hdfs_temp = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/update_trade_cnt_feature_data.csv'
hdfs_temp_02 = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/new_update_lgbm_model_data.csv'
hdfs_temp_03 = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/test_revoke_features_to_results.csv'
trademark_raw = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/raw/trademark.csv'
trademark_mid = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/mid/trademark_mid.csv'
trademark_feature ='hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/feature/feature_extract_trademark.csv'
feature_extract_judgedoc ='hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/feature/feature_extract_judgedoc.csv'
judgedoc_litigant_mid = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/mid/judgedoc_litigant.csv'
judgedoc_litigant_raw = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/raw/judgedoc_litigant.txt'
# Number of enforcement (zhixing) cases against companies the subject company invests in
network_c_inv_c_zhixing_cnt_temp ='hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/feature/network_c_inv_c_zhixing_cnt.csv'
# Corporate (company-type) shareholders
raw_path_c_inv_c_share_from_saic = "hdfs://192.168.31.10:9000/hdfs/riskModelNotDaily/mid/list_company_a/c_inv_c_share.csv"
# This company has no corporate shareholders;
# company ---invests in---> company  /hdfs/riskModelNotDaily/raw/saic/company_inv_company.csv
company_inv_company = "hdfs://192.168.31.10:9000/hdfs/riskModelNotDaily/raw/saic/company_inv_company.csv"
# Companies invested in by the subject company's shareholders
rst_c_inv_c_inv_c = "hdfs://192.168.31.10:9000/hdfs/riskModelNotDaily/mid/network/c_inv_c_inv_c.csv"
# Companies in which the subject company's shareholders also hold shares
share_as_share_output = "hdfs://192.168.31.10:9000/hdfs/riskModelNotDaily/mid/network/share_as_share.csv"
cq_bank_1000 = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/cq_bank.csv'
shixin = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/raw/shixin.csv'
punish = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/punish.csv'
punish_litigants = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/punish_litigants.csv'
patent = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/company_patent.csv'
# trademark = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/mid/trademark_mid.csv'
courtnotice = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/mid/courtnotice_mid.csv'
courtannouncement = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/part-r-00000-83ceb54d-a0ac-4c67-b8c9-948bb2d11aa4.csv'
courtsszc_company = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/court_sszc_crawl_company.csv'
courtsszc_detail = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/court_sszc_crawl.csv'
judgedoc_company = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/raw/judgedoc_litigant.csv'
judgedoc_detail = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/raw/judgedoc.csv'
shixin_company = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/court_shixin_company.csv'
trademark = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/trademark.csv'
# punish = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/punish.csv'
# punish_litigants = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/punish_litigants.csv'
# df1 = pd.read_csv('/home/sc/Downloads/staticdata/3.01_data/update_trade_cnt_feature_data.csv')
# print(df1.groupby(df1['_c1']).size())
def test_courtsszc(spark, sc):
    sqlContext = SQLContext(sparkContext=sc)
    dfkk2 = sqlContext.read.csv(courtsszc_company, header=True)
    dfkk2.createOrReplaceTempView('y2')
    dfkk3 = sqlContext.read.csv(courtsszc_detail, header=True)
    dfkk3.createOrReplaceTempView('y3')
    # dfkk2.show()
    dfhh1 = sqlContext.sql(
        """select y2.company_name, y3.* from y2 left join y3 on y2.sc_data_id = y3.sc_data_id
        """)
    dfhh1.show()
    dfhh1.createOrReplaceTempView('hh1')
    dfkk4 = sqlContext.read.csv(cq_bank_1000, header=True)
    dfkk4.createOrReplaceTempView('y4')
    dfhh2 = sqlContext.sql(
        """select y4.company_name as company_name_a, hh1.* from y4 left join hh1 on y4.company_name = hh1.company_name
        """)
    dfhh2.show()
    dfhh2.repartition(1).write.csv("hdfs://192.168.31.10:9000/hdfs/tmp/statistic/cq_bank_courtsszc.csv",
                                   mode='overwrite', header=True, quoteAll=True)
    spark.stop()
def test_judgedoc(spark, sc):
    """litigant_name, doc_id, litigant_type, case_type, case_reason, publish_date, trail_date, case_result"""
    judgedoc_sample_field = ['litigant_name', 'doc_id', 'litigant_type', 'case_type',
                             'case_reason', 'publish_date', 'trail_date', 'case_result']
    judgedoc_sample_schema = StructType(
        [StructField(field_name, StringType(), True) for field_name in judgedoc_sample_field])
    sqlContext = SQLContext(sparkContext=sc)
    dfkk2 = sqlContext.read.csv(judgedoc_company, header=False, sep='\t')
    dfkk2.createOrReplaceTempView('y2')
    # print(dfkk2.columns)
    # spark.stop()
    dfkk4 = sqlContext.read.csv(cq_bank_1000, header=True)
    dfkk4.createOrReplaceTempView('y4')
    dfhh2 = sqlContext.sql(
        """select y4.company_name, y2._c0 as doc_id from y4 left join y2 on y4.company_name = y2._c1
        """)
    # dfhh2.show()
    dfhh2.createOrReplaceTempView('hh2')
    dfkk3 = sqlContext.read.csv(
        judgedoc_detail, quote="", schema=judgedoc_sample_schema, header=False, nanValue='nan',
        nullValue='nan', inferSchema=True)
    print(dfkk3.columns)
    print(type(dfkk3))
    # print(dfkk3[1:3])
    # spark.stop()  # debugging stop; commented out so the join and write below can run
    # dfkk3 = dfkk3 \
    #     .filter(lambda line: len((line["value"] or "").split(',')) == 8) \
    #     .map(lambda line: Row(**dict(zip(judgedoc_sample_field, line["value"].split(','))))) \
    #     .toDF(judgedoc_sample_schema)
    dfkk3.createOrReplaceTempView('y3')
    # dfkk2.show()
    dfhh1 = sqlContext.sql(
        """select hh2.company_name, y3.* from hh2 left join y3 on hh2.doc_id = y3.doc_id
        """)
    # dfhh1.show()
    dfhh1.repartition(1).write.csv("hdfs://192.168.31.10:9000/hdfs/tmp/statistic/cq_bank_judgedoc.csv",
                                   mode='overwrite', header=True, quoteAll=True, sep='\t')
    spark.stop()
def test_shixin(spark, sc):
    sqlContext = SQLContext(sparkContext=sc)
    dfkk2 = sqlContext.read.csv(shixin_company, header=True)
    dfkk2.createOrReplaceTempView('y2')
    dfkk4 = sqlContext.read.csv(cq_bank_1000, header=True)
    dfkk4.createOrReplaceTempView('y4')
    dfhh2 = sqlContext.sql(
        """select y4.company_name as company_name_a, y2.* from y4 left join y2 on y4.company_name = y2.company_name
        """)
    dfhh2.show()
    dfhh2.repartition(1).write.csv("hdfs://192.168.31.10:9000/hdfs/tmp/statistic/cq_bank_shixin.csv",
                                   mode='overwrite', header=True, quoteAll=True)
    spark.stop()
def test_punish(spark, sc):
    sqlContext = SQLContext(sparkContext=sc)
    dfkk2 = sqlContext.read.csv(punish_litigants, header=True)
    dfkk2.createOrReplaceTempView('y2')
    dfkk4 = sqlContext.read.csv(cq_bank_1000, header=True)
    dfkk4.createOrReplaceTempView('y4')
    dfhh3 = sqlContext.sql(
        """select company_name, y2.sc_data_id from y4 left join y2 on y4.company_name = y2.name
        """)
    dfhh3.createOrReplaceTempView('y3')
    dfkk6 = sqlContext.read.csv(punish, header=True)
    dfkk6.createOrReplaceTempView('y6')
    dfhh5 = sqlContext.sql(
        """select company_name, y6.* from y3 left join y6 on y3.sc_data_id = y6.sc_data_id
        """)
    dfhh5.show()
    dfhh5.repartition(1).write.csv("hdfs://192.168.31.10:9000/hdfs/tmp/statistic/cq_bank_punish.csv",
                                   mode='overwrite', header=True, quoteAll=True)
    spark.stop()
def test_trademark(spark, sc):
    sqlContext = SQLContext(sparkContext=sc)
    dfkk2 = sqlContext.read.csv(trademark, header=True)
    dfkk2.createOrReplaceTempView('y2')
    dfkk4 = sqlContext.read.csv(cq_bank_1000, header=True)
    dfkk4.createOrReplaceTempView('y4')
    dfhh3 = sqlContext.sql(
        """select y4.company_name as company_name_a, y2.* from y4 left join y2 on y4.company_name = y2.company_name
        """)
    dfhh3.repartition(1).write.csv("hdfs://192.168.31.10:9000/hdfs/tmp/statistic/cq_bank_trademark.csv",
                                   mode='overwrite', header=True, quoteAll=True)
    spark.stop()
#
if __name__ == '__main__':
    spark = SparkSession.builder.master(sparkpath).appName("SC_ETL_ccs_spark") \
        .getOrCreate()
    # spark.conf.set("spark.driver.maxResultSize", "4g")
    # spark.conf.set("spark.sql.broadcastTimeout", 1200)
    # spark.conf.set("spark.sql.crossJoin.enabled", "true")
    # spark.conf.set("spark.cores.max", "10")
    sc = spark.sparkContext
    # test_courtsszc(spark, sc)
    # test_judgedoc(spark, sc)
    # test_shixin(spark, sc)
    # test_punish(spark, sc)
    test_trademark(spark, sc)
The saved result tables looked garbled when the CSV was opened. This can be handled by setting the parameter quoteAll=True when writing, that is, every field is enclosed in quotes on output.
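For reference, a minimal sketch of that write call, reusing the dfhh3 DataFrame and the output path from test_trademark above, followed by reading the file back as a quick check that the columns survive:

# Quote every field on write so embedded delimiters cannot shift the columns,
# then read the file back with the default quote handling as a sanity check.
out_path = "hdfs://192.168.31.10:9000/hdfs/tmp/statistic/cq_bank_trademark.csv"
dfhh3.repartition(1).write.csv(out_path, mode='overwrite', header=True, quoteAll=True)
check = spark.read.csv(out_path, header=True)
check.show(5, truncate=False)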
Detailed explanation of the parameters:
For the court judgment documents (裁判文書):
dfkk3 = sqlContext.read.csv(
    judgedoc_detail, quote="", schema=judgedoc_sample_schema, header=False, nanValue='nan',
    nullValue='nan', inferSchema=True)
Reading this table was problematic, and even with these parameter settings the problems remained.
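One fallback, hinted at by the commented-out filter/map block in test_judgedoc above, is to bypass the CSV reader entirely: load the file as plain text, drop malformed rows, and build the DataFrame against the declared schema. This is only a sketch under the assumption that the export is comma-separated with exactly eight fields per good row; the field list and path are taken from the script above.

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

spark = SparkSession.builder.getOrCreate()
judgedoc_detail = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/raw/judgedoc.csv'

fields = ['litigant_name', 'doc_id', 'litigant_type', 'case_type',
          'case_reason', 'publish_date', 'trail_date', 'case_result']
schema = StructType([StructField(f, StringType(), True) for f in fields])

# Read each line as a single string column named "value".
raw = spark.read.text(judgedoc_detail)

# Keep only lines that split into exactly 8 fields, then map them onto the schema.
clean = (raw.rdd
         .map(lambda r: (r["value"] or "").split(','))
         .filter(lambda parts: len(parts) == len(fields))
         .map(lambda parts: tuple(parts))
         .toDF(schema))

clean.createOrReplaceTempView('y3')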