Pitfalls I hit while parsing data

Posted by 一隻勤奮愛思考的豬 on 2018-10-08
import sys

from pyspark.sql.types import StructType, StringType, StructField

# Python 2 only: force the default string encoding to UTF-8 so Chinese text does not
# raise UnicodeDecodeError (reload/setdefaultencoding are not available in Python 3).
reload(sys)
sys.setdefaultencoding('utf8')
# Path for spark source folder
import os

# os.environ['SPARK_HOME'] = "/opt/spark-2.0.1-bin-hadoop2.7"

# Append pyspark  to Python Path
# sys.path.append("/opt/spark-2.0.1-bin-hadoop2.7/python/")
# sys.path.append("/opt/spark-2.0.1-bin-hadoop2.7/python/lib/")

try:
    from pyspark import SparkContext
    from pyspark import SparkConf
    from pyspark.sql import SparkSession
    from pyspark.sql import SQLContext
    from pyspark.sql import DataFrame
    from pyspark.sql import Row
    print("Successfully imported Spark Modules")
except ImportError as e:
    print("Can not import Spark Modules", e)
    sys.exit(1)
sparkpath = "spark://192.168.31.10:7077"

hdfspath = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/raw/courtnotice.csv'
hdfspath_1 = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/raw/judgedoc_litigant.txt'
hdfspath_2 = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/raw/judgedoc_detail_tmp.csv'
hdfspath_3 ='hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/feature/feature_extract_shixin.csv'
hdfspath_4 ='hdfs://192.168.31.10:9000/hdfs/riskModelNotDaily/mid/saic/share_as_fr.csv'
hdfs_temp = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/update_trade_cnt_feature_data.csv'
hdfs_temp_02 = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/new_update_lgbm_model_data.csv'

hdfs_temp_03 = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/test_revoke_features_to_results.csv'
trademark_raw = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/raw/trademark.csv'
trademark_mid = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/mid/trademark_mid.csv'
trademark_feature ='hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/feature/feature_extract_trademark.csv'
feature_extract_judgedoc ='hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/feature/feature_extract_judgedoc.csv'
judgedoc_litigant_mid = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/mid/judgedoc_litigant.csv'
judgedoc_litigant_raw = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/raw/judgedoc_litigant.txt'

# Number of court-enforcement (zhixing) records of companies this company has invested in
network_c_inv_c_zhixing_cnt_temp ='hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/feature/network_c_inv_c_zhixing_cnt.csv'

# Shareholders that are themselves companies (corporate shareholders)
raw_path_c_inv_c_share_from_saic = "hdfs://192.168.31.10:9000/hdfs/riskModelNotDaily/mid/list_company_a/c_inv_c_share.csv"
# (This company has no corporate shareholders.)

# Company ---invests in---> company  /hdfs/riskModelNotDaily/raw/saic/company_inv_company.csv
company_inv_company = "hdfs://192.168.31.10:9000/hdfs/riskModelNotDaily/raw/saic/company_inv_company.csv"
# Companies invested in by the subject company's shareholders
rst_c_inv_c_inv_c = "hdfs://192.168.31.10:9000/hdfs/riskModelNotDaily/mid/network/c_inv_c_inv_c.csv"

# Companies in which the subject company's shareholders are themselves shareholders
share_as_share_output = "hdfs://192.168.31.10:9000/hdfs/riskModelNotDaily/mid/network/share_as_share.csv"
cq_bank_1000 = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/cq_bank.csv'

shixin = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/raw/shixin.csv'
punish = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/punish.csv'
punish_litigants = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/punish_litigants.csv'
patent = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/company_patent.csv'
# trademark = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/mid/trademark_mid.csv'
courtnotice = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/mid/courtnotice_mid.csv'
courtannouncement = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/part-r-00000-83ceb54d-a0ac-4c67-b8c9-948bb2d11aa4.csv'
courtsszc_company = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/court_sszc_crawl_company.csv'
courtsszc_detail = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/court_sszc_crawl.csv'

judgedoc_company = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/raw/judgedoc_litigant.csv'
judgedoc_detail = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/raw/judgedoc.csv'

shixin_company = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/court_shixin_company.csv'
trademark = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/trademark.csv'
# punish = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/punish.csv'
# punish_litigants = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/punish_litigants.csv'

# df1 = pd.read_csv('/home/sc/Downloads/staticdata/3.01_data/update_trade_cnt_feature_data.csv')
# print(df1.groupby(df1['_c1']).size())
def test_courtsszc(spark, sc):
    """Join the court sszc company table with its detail table on sc_data_id, then
    left-join the cq_bank company list on company name and write the result to HDFS."""
    sqlContext = SQLContext(sparkContext=sc)
    dfkk2 = sqlContext.read.csv(
        courtsszc_company,header = True)
    dfkk2.createOrReplaceTempView('y2')
    dfkk3 = sqlContext.read.csv(
        courtsszc_detail, header=True)
    dfkk3.createOrReplaceTempView('y3')
    # dfkk2.show()
    dfhh1 = sqlContext.sql(
        """select y2.company_name,y3.* from y2 left join y3 on y2.sc_data_id = y3.sc_data_id
         """)
    dfhh1.show()
    dfhh1.createOrReplaceTempView('hh1')
    dfkk4 = sqlContext.read.csv(
        cq_bank_1000, header=True)
    dfkk4.createOrReplaceTempView('y4')
    dfhh2 = sqlContext.sql(
        """select y4.company_name as company_name_a,hh1.* from y4 left join hh1 on y4.company_name = hh1.company_name
         """)
    dfhh2.show()
    dfhh2.repartition(1).write.csv("hdfs://192.168.31.10:9000/hdfs/tmp/statistic/cq_bank_courtsszc.csv",
                                   mode='overwrite', header=True,quoteAll = True)
    spark.stop()

def test_judgedoc(spark, sc):
    """Map cq_bank companies to judgment-document ids via the litigant table, then join
    the judgment detail table (litigant_name, doc_id, litigant_type, case_type,
    case_reason, publish_date, trail_date, case_result) and write the result."""
    judgedoc_sample_field = ['litigant_name','doc_id','litigant_type','case_type',
                             'case_reason','publish_date','trail_date','case_result']
    judgedoc_sample_schema = StructType(
    [StructField(field_name, StringType(), True) for field_name in judgedoc_sample_field])
    sqlContext = SQLContext(sparkContext=sc)
    dfkk2 = sqlContext.read.csv(
        judgedoc_company,header = False,sep = '\t')
    dfkk2.createOrReplaceTempView('y2')
    # print(dfkk2.columns)
    # spark.stop()
    dfkk4 = sqlContext.read.csv(
        cq_bank_1000, header=True)
    dfkk4.createOrReplaceTempView('y4')
    dfhh2 = sqlContext.sql(
        """select y4.company_name,y2._c0 as doc_id from y4 left join y2 on y4.company_name = y2._c1
         """)
    # dfhh2.show()
    dfhh2.createOrReplaceTempView('hh2')
    dfkk3 = sqlContext.read.csv(
        judgedoc_detail,quote="",schema=judgedoc_sample_schema,header=False,nanValue='nan',
    nullValue='nan',inferSchema=True)
    print(dfkk3.columns)
    print(type(dfkk3))
    # print(dfkk3[1:3])
    # Debugging early exit: stopping the SparkSession here means the join and
    # write below never run. Comment this line out to execute the rest.
    spark.stop()
    # dfkk3 = dfkk3 \
    #     .filter(lambda line: len((line["value"] or "").split(',')) == 8) \
    #     .map(lambda line: Row(**dict(zip(judgedoc_sample_field, line["value"].split(','))))) \
    #     .toDF(judgedoc_sample_schema)
    #     #  \

    dfkk3.createOrReplaceTempView('y3')
    # dfkk2.show()
    dfhh1 = sqlContext.sql(
        """select hh2.company_name,y3.* from hh2 left join y3 on hh2.doc_id = y3.doc_id
         """)
    # dfhh1.show()
    dfhh1.repartition(1).write.csv("hdfs://192.168.31.10:9000/hdfs/tmp/statistic/cq_bank_judgedoc.csv",
                                   mode='overwrite', header=True,quoteAll = True,sep = '\t')
    spark.stop()

def test_shixin(spark, sc):
    """Left-join the cq_bank company list with the shixin (dishonest judgment debtor)
    company table by company name and write the result to HDFS."""
    sqlContext = SQLContext(sparkContext=sc)
    dfkk2 = sqlContext.read.csv(
        shixin_company,header = True)
    dfkk2.createOrReplaceTempView('y2')
    dfkk4 = sqlContext.read.csv(
        cq_bank_1000, header=True)
    dfkk4.createOrReplaceTempView('y4')
    dfhh2 = sqlContext.sql(
        """select y4.company_name as company_name_a,y2.* from y4 left join y2 on y4.company_name = y2.company_name
         """)
    dfhh2.show()
    dfhh2.repartition(1).write.csv("hdfs://192.168.31.10:9000/hdfs/tmp/statistic/cq_bank_shixin.csv",
                                   mode='overwrite', header=True, quoteAll = True )
    spark.stop()

def test_punish(spark, sc):
    """Join cq_bank companies to administrative punishment records: match litigants by
    name to obtain sc_data_id, then join the punishment detail table and write the result."""
    sqlContext = SQLContext(sparkContext=sc)
    dfkk2 = sqlContext.read.csv(
        punish_litigants,header = True)
    dfkk2.createOrReplaceTempView('y2')
    dfkk4 = sqlContext.read.csv(
        cq_bank_1000, header=True)
    dfkk4.createOrReplaceTempView('y4')
    dfhh3 = sqlContext.sql(
        """select company_name,y2.sc_data_id from y4 left join y2 on y4.company_name = y2.name
         """)
    dfhh3.createOrReplaceTempView('y3')
    dfkk6 = sqlContext.read.csv(
        punish, header=True)
    dfkk6.createOrReplaceTempView('y6')
    dfhh5 = sqlContext.sql(
        """select company_name,y6.* from y3 left join y6 on y3.sc_data_id = y6.sc_data_id
         """)

    dfhh5.show()
    dfhh5.repartition(1).write.csv("hdfs://192.168.31.10:9000/hdfs/tmp/statistic/cq_bank_punish.csv",
                                   mode='overwrite', header=True, quoteAll = True )
    spark.stop()

def test_trademark(spark, sc):
    """Left-join the cq_bank company list with the trademark table by company name
    and write the result to HDFS."""
    sqlContext = SQLContext(sparkContext=sc)
    dfkk2 = sqlContext.read.csv(
        trademark,header = True)
    dfkk2.createOrReplaceTempView('y2')
    dfkk4 = sqlContext.read.csv(
        cq_bank_1000, header=True)
    dfkk4.createOrReplaceTempView('y4')
    dfhh3 = sqlContext.sql(
        """select y4.company_name as company_name_a,y2.* from y4 left join y2 on y4.company_name = y2.company_name
         """)
    dfhh3.repartition(1).write.csv("hdfs://192.168.31.10:9000/hdfs/tmp/statistic/cq_bank_trademark.csv",
                                   mode='overwrite', header=True, quoteAll = True )
    spark.stop()

#
if __name__ == '__main__':

    spark = SparkSession.builder.master(sparkpath).appName("SC_ETL_ccs_spark") \
        .getOrCreate()
    # spark.conf.set("spark.driver.maxResultSize", "4g")
    # spark.conf.set("spark.sql.broadcastTimeout", 1200)
    # spark.conf.set("spark.sql.crossJoin.enabled", "true")
    # spark.conf.set("spark.cores.max", "10")
    sc = spark.sparkContext
    # test_courtsszc(spark, sc)
    # test_judgedoc(spark, sc)
    # test_shixin(spark, sc)
    # test_punish(spark, sc)
    test_trademark(spark, sc)

When the saved result tables were opened as CSV, the content looked garbled. This can be avoided by setting quoteAll=True when writing.
Parameter explanation: quoteAll forces every field to be enclosed in quotes (by default only values containing the quote character are quoted), so fields that contain the delimiter or line breaks no longer corrupt the row layout.
For the judgment documents (裁判文書):
dfkk3 = sqlContext.read.csv(
    judgedoc_detail, quote="", schema=judgedoc_sample_schema, header=False, nanValue='nan',
    nullValue='nan', inferSchema=True)
Reading this table was problematic; even with these parameters the read still failed. (Note also that passing both schema and inferSchema is redundant: the explicit schema takes precedence.)
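One workaround to try, sketched below along the lines of the commented-out filter/map code in test_judgedoc above (it assumes each line really has eight comma-separated fields and is untested against the real file), is to read the file as plain text, drop malformed lines, and build the DataFrame against the schema explicitly:

from pyspark.sql.types import StructType, StructField, StringType

fields = ['litigant_name', 'doc_id', 'litigant_type', 'case_type',
          'case_reason', 'publish_date', 'trail_date', 'case_result']
schema = StructType([StructField(f, StringType(), True) for f in fields])

# Read the raw file as a single string column named "value", keep only lines that
# split into exactly eight fields, then map them onto the schema positionally.
lines = spark.read.text(judgedoc_detail)
rows = (lines.rdd
        .map(lambda r: (r['value'] or '').split(','))
        .filter(lambda parts: len(parts) == 8)
        .map(lambda parts: tuple(parts)))
dfkk3 = spark.createDataFrame(rows, schema)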
