Pitfalls I hit while parsing data

Posted by 一隻勤奮愛思考的豬 on 2018-10-08
import sys

from pyspark.sql.types import StructType, StringType, StructField

# Python 2 only: force the default string encoding to UTF-8 so Chinese text does not
# raise UnicodeDecodeError (reload/setdefaultencoding are not available in Python 3).
reload(sys)
sys.setdefaultencoding('utf8')
# Path for spark source folder
import os

# os.environ['SPARK_HOME'] = "/opt/spark-2.0.1-bin-hadoop2.7"

# Append pyspark  to Python Path
# sys.path.append("/opt/spark-2.0.1-bin-hadoop2.7/python/")
# sys.path.append("/opt/spark-2.0.1-bin-hadoop2.7/python/lib/")

try:
    from pyspark import SparkContext
    from pyspark import SparkConf
    from pyspark.sql import SparkSession
    from pyspark.sql import SQLContext
    from pyspark.sql import DataFrame
    from pyspark.sql import Row
    print("Successfully imported Spark Modules")
except ImportError as e:
    print("Can not import Spark Modules", e)
    sys.exit(1)
sparkpath = "spark://192.168.31.10:7077"

hdfspath = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/raw/courtnotice.csv'
hdfspath_1 = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/raw/judgedoc_litigant.txt'
hdfspath_2 = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/raw/judgedoc_detail_tmp.csv'
hdfspath_3 ='hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/feature/feature_extract_shixin.csv'
hdfspath_4 ='hdfs://192.168.31.10:9000/hdfs/riskModelNotDaily/mid/saic/share_as_fr.csv'
hdfs_temp = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/update_trade_cnt_feature_data.csv'
hdfs_temp_02 = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/new_update_lgbm_model_data.csv'

hdfs_temp_03 = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/test_revoke_features_to_results.csv'
trademark_raw = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/raw/trademark.csv'
trademark_mid = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/mid/trademark_mid.csv'
trademark_feature ='hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/feature/feature_extract_trademark.csv'
feature_extract_judgedoc ='hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/feature/feature_extract_judgedoc.csv'
judgedoc_litigant_mid = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/mid/judgedoc_litigant.csv'
judgedoc_litigant_raw = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/raw/judgedoc_litigant.txt'

# Number of court-enforcement (zhixing) records of companies this company has invested in
network_c_inv_c_zhixing_cnt_temp ='hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/feature/network_c_inv_c_zhixing_cnt.csv'

# Shareholders that are themselves companies (corporate shareholders)
raw_path_c_inv_c_share_from_saic = "hdfs://192.168.31.10:9000/hdfs/riskModelNotDaily/mid/list_company_a/c_inv_c_share.csv"
# (This company has no corporate shareholders.)

# Company ---invests in---> company  /hdfs/riskModelNotDaily/raw/saic/company_inv_company.csv
company_inv_company = "hdfs://192.168.31.10:9000/hdfs/riskModelNotDaily/raw/saic/company_inv_company.csv"
# Companies invested in by the subject company's shareholders
rst_c_inv_c_inv_c = "hdfs://192.168.31.10:9000/hdfs/riskModelNotDaily/mid/network/c_inv_c_inv_c.csv"

# Companies in which the subject company's shareholders are themselves shareholders
share_as_share_output = "hdfs://192.168.31.10:9000/hdfs/riskModelNotDaily/mid/network/share_as_share.csv"
cq_bank_1000 = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/cq_bank.csv'

shixin = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/raw/shixin.csv'
punish = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/punish.csv'
punish_litigants = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/punish_litigants.csv'
patent = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/company_patent.csv'
# trademark = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/mid/trademark_mid.csv'
courtnotice = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/mid/courtnotice_mid.csv'
courtannouncement = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/part-r-00000-83ceb54d-a0ac-4c67-b8c9-948bb2d11aa4.csv'
courtsszc_company = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/court_sszc_crawl_company.csv'
courtsszc_detail = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/court_sszc_crawl.csv'

judgedoc_company = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/raw/judgedoc_litigant.csv'
judgedoc_detail = 'hdfs://192.168.31.10:9000/hdfs/riskModelAuto/2018-07-24/raw/judgedoc.csv'

shixin_company = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/court_shixin_company.csv'
trademark = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/trademark.csv'
# punish = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/punish.csv'
# punish_litigants = 'hdfs://192.168.31.10:9000/hdfs/tmp/statistic/punish_litigants.csv'

# df1 = pd.read_csv('/home/sc/Downloads/staticdata/3.01_data/update_trade_cnt_feature_data.csv')
# print(df1.groupby(df1['_c1']).size())
def test_courtsszc(spark, sc):
    """Join the court sszc company table with its detail table on sc_data_id, then
    left-join the cq_bank company list on company name and write the result to HDFS."""
    sqlContext = SQLContext(sparkContext=sc)
    dfkk2 = sqlContext.read.csv(
        courtsszc_company,header = True)
    dfkk2.createOrReplaceTempView('y2')
    dfkk3 = sqlContext.read.csv(
        courtsszc_detail, header=True)
    dfkk3.createOrReplaceTempView('y3')
    # dfkk2.show()
    dfhh1 = sqlContext.sql(
        """select y2.company_name,y3.* from y2 left join y3 on y2.sc_data_id = y3.sc_data_id
         """)
    dfhh1.show()
    dfhh1.createOrReplaceTempView('hh1')
    dfkk4 = sqlContext.read.csv(
        cq_bank_1000, header=True)
    dfkk4.createOrReplaceTempView('y4')
    dfhh2 = sqlContext.sql(
        """select y4.company_name as company_name_a,hh1.* from y4 left join hh1 on y4.company_name = hh1.company_name
         """)
    dfhh2.show()
    dfhh2.repartition(1).write.csv("hdfs://192.168.31.10:9000/hdfs/tmp/statistic/cq_bank_courtsszc.csv",
                                   mode='overwrite', header=True,quoteAll = True)
    spark.stop()

def test_judgedoc(spark, sc):
    """Map cq_bank companies to judgment-document ids via the litigant table, then join
    the judgment detail table (litigant_name, doc_id, litigant_type, case_type,
    case_reason, publish_date, trail_date, case_result) and write the result."""
    judgedoc_sample_field = ['litigant_name','doc_id','litigant_type','case_type',
                             'case_reason','publish_date','trail_date','case_result']
    judgedoc_sample_schema = StructType(
    [StructField(field_name, StringType(), True) for field_name in judgedoc_sample_field])
    sqlContext = SQLContext(sparkContext=sc)
    dfkk2 = sqlContext.read.csv(
        judgedoc_company,header = False,sep = '\t')
    dfkk2.createOrReplaceTempView('y2')
    # print(dfkk2.columns)
    # spark.stop()
    dfkk4 = sqlContext.read.csv(
        cq_bank_1000, header=True)
    dfkk4.createOrReplaceTempView('y4')
    dfhh2 = sqlContext.sql(
        """select y4.company_name,y2._c0 as doc_id from y4 left join y2 on y4.company_name = y2._c1
         """)
    # dfhh2.show()
    dfhh2.createOrReplaceTempView('hh2')
    dfkk3 = sqlContext.read.csv(
        judgedoc_detail,quote="",schema=judgedoc_sample_schema,header=False,nanValue='nan',
    nullValue='nan',inferSchema=True)
    print(dfkk3.columns)
    print(type(dfkk3))
    # print(dfkk3[1:3])
    # Debugging early exit: stopping the SparkSession here means the join and
    # write below never run. Comment this line out to execute the rest.
    spark.stop()
    # dfkk3 = dfkk3 \
    #     .filter(lambda line: len((line["value"] or "").split(',')) == 8) \
    #     .map(lambda line: Row(**dict(zip(judgedoc_sample_field, line["value"].split(','))))) \
    #     .toDF(judgedoc_sample_schema)
    #     #  \

    dfkk3.createOrReplaceTempView('y3')
    # dfkk2.show()
    dfhh1 = sqlContext.sql(
        """select hh2.company_name,y3.* from hh2 left join y3 on hh2.doc_id = y3.doc_id
         """)
    # dfhh1.show()
    dfhh1.repartition(1).write.csv("hdfs://192.168.31.10:9000/hdfs/tmp/statistic/cq_bank_judgedoc.csv",
                                   mode='overwrite', header=True,quoteAll = True,sep = '\t')
    spark.stop()

def test_shixin(spark, sc):
    """Left-join the cq_bank company list with the shixin (dishonest judgment debtor)
    company table by company name and write the result to HDFS."""
    sqlContext = SQLContext(sparkContext=sc)
    dfkk2 = sqlContext.read.csv(
        shixin_company,header = True)
    dfkk2.createOrReplaceTempView('y2')
    dfkk4 = sqlContext.read.csv(
        cq_bank_1000, header=True)
    dfkk4.createOrReplaceTempView('y4')
    dfhh2 = sqlContext.sql(
        """select y4.company_name as company_name_a,y2.* from y4 left join y2 on y4.company_name = y2.company_name
         """)
    dfhh2.show()
    dfhh2.repartition(1).write.csv("hdfs://192.168.31.10:9000/hdfs/tmp/statistic/cq_bank_shixin.csv",
                                   mode='overwrite', header=True, quoteAll = True )
    spark.stop()

def test_punish(spark, sc):
    """Join cq_bank companies to administrative punishment records: match litigants by
    name to obtain sc_data_id, then join the punishment detail table and write the result."""
    sqlContext = SQLContext(sparkContext=sc)
    dfkk2 = sqlContext.read.csv(
        punish_litigants,header = True)
    dfkk2.createOrReplaceTempView('y2')
    dfkk4 = sqlContext.read.csv(
        cq_bank_1000, header=True)
    dfkk4.createOrReplaceTempView('y4')
    dfhh3 = sqlContext.sql(
        """select company_name,y2.sc_data_id from y4 left join y2 on y4.company_name = y2.name
         """)
    dfhh3.createOrReplaceTempView('y3')
    dfkk6 = sqlContext.read.csv(
        punish, header=True)
    dfkk6.createOrReplaceTempView('y6')
    dfhh5 = sqlContext.sql(
        """select company_name,y6.* from y3 left join y6 on y3.sc_data_id = y6.sc_data_id
         """)

    dfhh5.show()
    dfhh5.repartition(1).write.csv("hdfs://192.168.31.10:9000/hdfs/tmp/statistic/cq_bank_punish.csv",
                                   mode='overwrite', header=True, quoteAll = True )
    spark.stop()

def test_trademark(spark, sc):
    """Left-join the cq_bank company list with the trademark table by company name
    and write the result to HDFS."""
    sqlContext = SQLContext(sparkContext=sc)
    dfkk2 = sqlContext.read.csv(
        trademark,header = True)
    dfkk2.createOrReplaceTempView('y2')
    dfkk4 = sqlContext.read.csv(
        cq_bank_1000, header=True)
    dfkk4.createOrReplaceTempView('y4')
    dfhh3 = sqlContext.sql(
        """select y4.company_name as company_name_a,y2.* from y4 left join y2 on y4.company_name = y2.company_name
         """)
    dfhh3.repartition(1).write.csv("hdfs://192.168.31.10:9000/hdfs/tmp/statistic/cq_bank_trademark.csv",
                                   mode='overwrite', header=True, quoteAll = True )
    spark.stop()

#
if __name__ == '__main__':

    spark = SparkSession.builder.master(sparkpath).appName("SC_ETL_ccs_spark") \
        .getOrCreate()
    # spark.conf.set("spark.driver.maxResultSize", "4g")
    # spark.conf.set("spark.sql.broadcastTimeout", 1200)
    # spark.conf.set("spark.sql.crossJoin.enabled", "true")
    # spark.conf.set("spark.cores.max", "10")
    sc = spark.sparkContext
    # test_courtsszc(spark, sc)
    # test_judgedoc(spark, sc)
    # test_shixin(spark, sc)
    # test_punish(spark, sc)
    test_trademark(spark, sc)

When the saved result tables were opened as CSV, the content looked garbled. This can be avoided by setting quoteAll=True when writing.
Parameter explanation: quoteAll forces every field to be enclosed in quotes (by default only values containing the quote character are quoted), so fields that contain the delimiter or line breaks no longer corrupt the row layout.
For the judgment documents (裁判文書):
dfkk3 = sqlContext.read.csv(
    judgedoc_detail, quote="", schema=judgedoc_sample_schema, header=False, nanValue='nan',
    nullValue='nan', inferSchema=True)
Reading this table was problematic; even with these parameters the read still failed. (Note also that passing both schema and inferSchema is redundant: the explicit schema takes precedence.)
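One workaround to try, sketched below along the lines of the commented-out filter/map code in test_judgedoc above (it assumes each line really has eight comma-separated fields and is untested against the real file), is to read the file as plain text, drop malformed lines, and build the DataFrame against the schema explicitly:

from pyspark.sql.types import StructType, StructField, StringType

fields = ['litigant_name', 'doc_id', 'litigant_type', 'case_type',
          'case_reason', 'publish_date', 'trail_date', 'case_result']
schema = StructType([StructField(f, StringType(), True) for f in fields])

# Read the raw file as a single string column named "value", keep only lines that
# split into exactly eight fields, then map them onto the schema positionally.
lines = spark.read.text(judgedoc_detail)
rows = (lines.rdd
        .map(lambda r: (r['value'] or '').split(','))
        .filter(lambda parts: len(parts) == 8)
        .map(lambda parts: tuple(parts)))
dfkk3 = spark.createDataFrame(rows, schema)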
