Pitfalls Encountered While Parsing Data
import sys
from pyspark.sql.types import StructType, StringType, StructField
# Python 2 only: force the default string encoding to UTF-8 so Chinese text
# does not raise UnicodeDecodeError. reload()/sys.setdefaultencoding() do not
# exist in Python 3, where str is already Unicode and this hack is unnecessary.
reload(sys)
sys.setdefaultencoding('utf8')
# Path for spark source folder
import os
# os.environ['SPARK_HOME'] = "/opt/spark-2.0.1-bin-hadoop2.7"
# Append pyspark to Python Path
# sys.path.append("/opt/spark-2.0.1-bin-hadoop2.7/python/")
# sys.path.append("/opt/spark-2.0.1-bin-hadoop2.7/python/lib/")
try:
    from pyspark import SparkContext
    from pyspark import SparkConf
    from pyspark.sql import SparkSession
    from pyspark.sql import SQLContext
    from pyspark.sql import DataFrame
    from pyspark.sql import Row
    print("Successfully imported Spark Modules")
except ImportError as e:
    print("Can not import Spark Modules", e)
    sys.exit(1)
sparkpath = "spark://192.168.31.10:7077"
hdfspath = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/raw/courtnotice.csv'
hdfspath_1 = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/raw/judgedoc_litigant.txt'
hdfspath_2 = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/raw/judgedoc_detail_tmp.csv'
hdfspath_3 = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/feature/feature_extract_shixin.csv'
hdfspath_4 = 'hdfs://192.168.31.10:9000/hdfs/riskModelNotDaily/mid/saic/share_as_fr.csv'
hdfs_temp = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/update_trade_cnt_feature_data.csv'
hdfs_temp_02 = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/new_update_lgbm_model_data.csv'
hdfs_temp_03 = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/test_revoke_features_to_results.csv'
trademark_raw = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/raw/trademark.csv'
trademark_mid = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/mid/trademark_mid.csv'
trademark_feature = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/feature/feature_extract_trademark.csv'
feature_extract_judgedoc = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/feature/feature_extract_judgedoc.csv'
judgedoc_litigant_mid = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/mid/judgedoc_litigant.csv'
judgedoc_litigant_raw = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/raw/judgedoc_litigant.txt'
# number of enforcement (zhixing) records against companies this company invests in
network_c_inv_c_zhixing_cnt_temp = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/feature/network_c_inv_c_zhixing_cnt.csv'
# shareholders that are themselves companies
raw_path_c_inv_c_share_from_saic = "hdfs://192.168.31.10:9000/hdfs/riskModelNotDaily/mid/list_company_a/c_inv_c_share.csv"
# the subject company has no corporate shareholders
# company --invests--> company: /hdfs/riskModelNotDaily/raw/saic/company_inv_company.csv
company_inv_company = "hdfs://192.168.31.10:9000/hdfs/riskModelNotDaily/raw/saic/company_inv_company.csv"
# companies invested in by the subject company's shareholders
rst_c_inv_c_inv_c = "hdfs://192.168.31.10:9000/hdfs/riskModelNotDaily/mid/network/c_inv_c_inv_c.csv"
# companies in which the subject company's shareholders are themselves shareholders
share_as_share_output = "hdfs://192.168.31.10:9000/hdfs/riskModelNotDaily/mid/network/share_as_share.csv"
cq_bank_1000 = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/cq_bank.csv'
shixin = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/raw/shixin.csv'
punish = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/punish.csv'
punish_litigants = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/punish_litigants.csv'
patent = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/company_patent.csv'
# trademark = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/mid/trademark_mid.csv'
courtnotice = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/mid/courtnotice_mid.csv'
courtannouncement = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/part-r-00000-83ceb54d-a0ac-4c67-b8c9-948bb2d11aa4.csv'
courtsszc_company = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/court_sszc_crawl_company.csv'
courtsszc_detail = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/court_sszc_crawl.csv'
judgedoc_company = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/raw/judgedoc_litigant.csv'
judgedoc_detail = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/raw/judgedoc.csv'
shixin_company = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/court_shixin_company.csv'
trademark = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/trademark.csv'
# punish = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/punish.csv'
# punish_litigants = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/punish_litigants.csv'
# df1 = pd.read_csv('/home/sc/Downloads/staticdata/3.01_data/update_trade_cnt_feature_data.csv')
# print(df1.groupby(df1['_c1']).size())
def test_courtsszc(spark, sc):
    sqlContext = SQLContext(sparkContext=sc)
    dfkk2 = sqlContext.read.csv(courtsszc_company, header=True)
    dfkk2.createOrReplaceTempView('y2')
    dfkk3 = sqlContext.read.csv(courtsszc_detail, header=True)
    dfkk3.createOrReplaceTempView('y3')
    # dfkk2.show()
    dfhh1 = sqlContext.sql(
        """select y2.company_name, y3.*
           from y2 left join y3 on y2.sc_data_id = y3.sc_data_id""")
    dfhh1.show()
    dfhh1.createOrReplaceTempView('hh1')
    dfkk4 = sqlContext.read.csv(cq_bank_1000, header=True)
    dfkk4.createOrReplaceTempView('y4')
    dfhh2 = sqlContext.sql(
        """select y4.company_name as company_name_a, hh1.*
           from y4 left join hh1 on y4.company_name = hh1.company_name""")
    dfhh2.show()
    dfhh2.repartition(1).write.csv(
        "hdfs://192.168.31.10:9000/hdfs/tmp/statistic/cq_bank_courtsszc.csv",
        mode='overwrite', header=True, quoteAll=True)
    spark.stop()
def test_judgedoc(spark, sc):
    # raw detail fields: litigant_name, doc_id, litigant_type, case_type,
    # case_reason, publish_date, trail_date, case_result
    judgedoc_sample_field = ['litigant_name', 'doc_id', 'litigant_type', 'case_type',
                             'case_reason', 'publish_date', 'trail_date', 'case_result']
    judgedoc_sample_schema = StructType(
        [StructField(field_name, StringType(), True) for field_name in judgedoc_sample_field])
    sqlContext = SQLContext(sparkContext=sc)
    dfkk2 = sqlContext.read.csv(judgedoc_company, header=False, sep='\t')
    dfkk2.createOrReplaceTempView('y2')
    # print(dfkk2.columns)
    dfkk4 = sqlContext.read.csv(cq_bank_1000, header=True)
    dfkk4.createOrReplaceTempView('y4')
    dfhh2 = sqlContext.sql(
        """select y4.company_name, y2._c0 as doc_id
           from y4 left join y2 on y4.company_name = y2._c1""")
    # dfhh2.show()
    dfhh2.createOrReplaceTempView('hh2')
    # NOTE: passing a fixed schema together with inferSchema=True is
    # contradictory (the explicit schema wins), and quote="" disables quoting
    # entirely; this is the problematic read discussed at the end of the post.
    dfkk3 = sqlContext.read.csv(
        judgedoc_detail, quote="", schema=judgedoc_sample_schema, header=False,
        nanValue='nan', nullValue='nan', inferSchema=True)
    print(dfkk3.columns)
    print(type(dfkk3))
    # print(dfkk3[1:3])
    # spark.stop()  # early debugging exit in the original; stopping here would break the join below
    # dfkk3 = dfkk3 \
    #     .filter(lambda line: len((line["value"] or "").split(',')) == 8) \
    #     .map(lambda line: Row(**dict(zip(judgedoc_sample_field, line["value"].split(','))))) \
    #     .toDF(judgedoc_sample_schema)
    dfkk3.createOrReplaceTempView('y3')
    dfhh1 = sqlContext.sql(
        """select hh2.company_name, y3.*
           from hh2 left join y3 on hh2.doc_id = y3.doc_id""")
    # dfhh1.show()
    dfhh1.repartition(1).write.csv(
        "hdfs://192.168.31.10:9000/hdfs/tmp/statistic/cq_bank_judgedoc.csv",
        mode='overwrite', header=True, quoteAll=True, sep='\t')
    spark.stop()
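# Sketch (an assumption of mine, not part of the original pipeline): a working
# version of the RDD-based parse that test_judgedoc leaves commented out. It
# keeps only lines that split into exactly 8 comma-separated fields and builds
# the DataFrame from tuples; tuples are used instead of Row(**kwargs) because
# pre-3.0 PySpark sorts Row keyword fields alphabetically, which would
# misalign them with the positional schema.
def parse_judgedoc_detail_rdd(spark):
    fields = ['litigant_name', 'doc_id', 'litigant_type', 'case_type',
              'case_reason', 'publish_date', 'trail_date', 'case_result']
    schema = StructType([StructField(f, StringType(), True) for f in fields])
    lines = spark.read.text(judgedoc_detail)  # one string column named "value"
    return lines.rdd \
        .map(lambda row: (row["value"] or "").split(',')) \
        .filter(lambda parts: len(parts) == 8) \
        .map(tuple) \
        .toDF(schema)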
def test_shixin(spark, sc):
    sqlContext = SQLContext(sparkContext=sc)
    dfkk2 = sqlContext.read.csv(shixin_company, header=True)
    dfkk2.createOrReplaceTempView('y2')
    dfkk4 = sqlContext.read.csv(cq_bank_1000, header=True)
    dfkk4.createOrReplaceTempView('y4')
    dfhh2 = sqlContext.sql(
        """select y4.company_name as company_name_a, y2.*
           from y4 left join y2 on y4.company_name = y2.company_name""")
    dfhh2.show()
    dfhh2.repartition(1).write.csv(
        "hdfs://192.168.31.10:9000/hdfs/tmp/statistic/cq_bank_shixin.csv",
        mode='overwrite', header=True, quoteAll=True)
    spark.stop()
def test_punish(spark, sc):
    sqlContext = SQLContext(sparkContext=sc)
    dfkk2 = sqlContext.read.csv(punish_litigants, header=True)
    dfkk2.createOrReplaceTempView('y2')
    dfkk4 = sqlContext.read.csv(cq_bank_1000, header=True)
    dfkk4.createOrReplaceTempView('y4')
    dfhh3 = sqlContext.sql(
        """select company_name, y2.sc_data_id
           from y4 left join y2 on y4.company_name = y2.name""")
    dfhh3.createOrReplaceTempView('y3')
    dfkk6 = sqlContext.read.csv(punish, header=True)
    dfkk6.createOrReplaceTempView('y6')
    dfhh5 = sqlContext.sql(
        """select company_name, y6.*
           from y3 left join y6 on y3.sc_data_id = y6.sc_data_id""")
    dfhh5.show()
    dfhh5.repartition(1).write.csv(
        "hdfs://192.168.31.10:9000/hdfs/tmp/statistic/cq_bank_punish.csv",
        mode='overwrite', header=True, quoteAll=True)
    spark.stop()
def test_trademark(spark, sc):
    sqlContext = SQLContext(sparkContext=sc)
    dfkk2 = sqlContext.read.csv(trademark, header=True)
    dfkk2.createOrReplaceTempView('y2')
    dfkk4 = sqlContext.read.csv(cq_bank_1000, header=True)
    dfkk4.createOrReplaceTempView('y4')
    dfhh3 = sqlContext.sql(
        """select y4.company_name as company_name_a, y2.*
           from y4 left join y2 on y4.company_name = y2.company_name""")
    dfhh3.repartition(1).write.csv(
        "hdfs://192.168.31.10:9000/hdfs/tmp/statistic/cq_bank_trademark.csv",
        mode='overwrite', header=True, quoteAll=True)
    spark.stop()
if __name__ == '__main__':
    spark = SparkSession.builder.master(sparkpath).appName("SC_ETL_ccs_spark") \
        .getOrCreate()
    # spark.conf.set("spark.driver.maxResultSize", "4g")
    # spark.conf.set("spark.sql.broadcastTimeout", 1200)
    # spark.conf.set("spark.sql.crossJoin.enabled", "true")
    # spark.conf.set("spark.cores.max", "10")
    sc = spark.sparkContext
    # test_courtsszc(spark, sc)
    # test_judgedoc(spark, sc)
    # test_shixin(spark, sc)
    # test_punish(spark, sc)
    test_trademark(spark, sc)
For the result tables written out above, the CSV file came out garbled when opened. This can be fixed by setting quoteAll=True when saving, as in the write calls in the script; a minimal standalone sketch follows.
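A minimal sketch of such a write (df_result and the output path are hypothetical placeholders, not part of the pipeline above; quoteAll=True wraps every field in quotes so embedded separators and line breaks cannot shift columns):

df_result.repartition(1).write.csv(
    'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/example_output.csv',  # hypothetical path
    mode='overwrite', header=True, quoteAll=True)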
Detailed parameter explanation. For the judgment documents (裁判文書), the read was:
dfkk3 = sqlContext.read.csv(
    judgedoc_detail, quote="", schema=judgedoc_sample_schema, header=False,
    nanValue='nan', nullValue='nan', inferSchema=True)
Reading this table was problematic, and setting the parameters this way still did not solve it; a more defensive read is sketched below.
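A sketch of the read I would try instead, under assumptions of my own rather than the post's: the explicit schema should win (so the contradictory inferSchema=True is dropped), the fields use standard double-quote quoting (rather than quote="", which disables quoting entirely), and Spark is at least 2.2 so that multiLine is available:

dfkk3 = sqlContext.read.csv(
    judgedoc_detail,
    schema=judgedoc_sample_schema,  # fixed schema; do not also pass inferSchema=True
    header=False,
    quote='"', escape='"',          # standard CSV quoting instead of quote=""
    multiLine=True,                 # tolerate line breaks inside quoted fields (Spark 2.2+)
    mode='DROPMALFORMED',           # drop rows that cannot be parsed into the schema
    nullValue='nan')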