Hands-On Case Study: Real-Time Streaming Log Analysis of Medical Clinical Big Data

- Posted by 狼 on 2019-02-13

2.6 Spark hands-on case: real-time log analysis

  • 2.6.1 Interaction flow diagram

(Interaction flow diagram image not included in this excerpt.)

  • 2.6.2 Client-side listener (Java)
    1 Zero intrusion into the HIS system: the medical HIS writes its log4j output to a file, and Chinese characters are Base64-encoded so they survive the socket transfer and the Python-side decoding intact.
    2 Spark Streaming natively supports watching a directory for changes, but the business system runs on a different machine in the same subnet; mount-based sharing and a Linux Samba file service were considered, yet either would require changes on the business system's machine plus access control over the shared files.
    3 Spark Streaming also supports Kafka as a log source, but the business system uses RabbitMQ for messaging; given points 2 and 3, a socket client listener was chosen instead.
    4 Socket data collection: guaranteeing the integrity of the business logs requires familiarity with the medical HIS log format, and the multi-threaded reader must never split a single complete log entry (a minimal Python client for sanity-checking the socket feed is sketched after the Java listener below).
    5 Java IO offers AIO, NIO and BIO; testing showed that the current Spark Streaming receiver only works with blocking I/O (BIO).
	// Tail the HIS log file from the last read offset and push complete log lines
	// to the connected Spark Streaming receiver over the socket.
	private void handleSocket() {
		lock.lock();
		Writer writer = null;
		RandomAccessFile raf = null;

		try {
			File file = new File(filepath);
			raf = new RandomAccessFile(file, "r");
			raf.seek(pointer); // resume from the offset reached in the previous pass
			writer = new OutputStreamWriter(socket.getOutputStream(), "UTF-8");

			String line = null;

			while ((line = raf.readLine()) != null) {
				if (Strings.isBlank(line)) {
					continue;
				}

				// RandomAccessFile.readLine() decodes bytes as ISO-8859-1, so re-decode as UTF-8
				line = new String(line.getBytes("ISO-8859-1"), "UTF-8");
				writer.write(line.concat("\n")); // one complete log entry per line on the wire
				writer.flush();
				logger.info("thread:{}----offset:{}----line read\n{} :", Thread.currentThread().getName(), pointer, line);

				pointer = raf.getFilePointer(); // remember the offset for the next pass
			}
			Thread.sleep(2000); // wait for new log data before the next pass
		} catch (Exception e) {
			logger.error(e.getMessage());
			e.printStackTrace();
		} finally {
			lock.unlock();
			fclose(writer, raf);
		}

	}
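
Before pointing Spark at the listener, the socket feed can be sanity-checked with a minimal Python client that mimics what socketTextStream does. This is a standalone sketch, not part of the original code; the host and port are placeholders for the Java listener's address:

import socket

def tail_socket(host='192.168.0.1', port=9999):
    # connect to the Java listener and print newline-delimited log entries
    with socket.create_connection((host, port)) as sock:
        buf = b''
        while True:
            chunk = sock.recv(4096)
            if not chunk:
                break  # the listener closed the connection
            buf += chunk
            while b'\n' in buf:  # the Java side writes exactly one log entry per line
                line, buf = buf.split(b'\n', 1)
                print(line.decode('utf-8', errors='replace'))

if __name__ == '__main__':
    tail_socket()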
  • 2.6.3 Spark Streaming real-time data reception (Python)

Global SparkConf configuration:

from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext

conf = SparkConf()
conf.setAppName("HIS real-time log analysis")
conf.setMaster('yarn')  # run on YARN (use spark://host:port for a standalone cluster)
conf.set('spark.executor.instances', 8)  # number of executors in the YARN cluster
conf.set('spark.executor.memory', '1g')
conf.set('spark.executor.cores', '1')
# conf.set('spark.cores.max', '2')
# conf.set('spark.logConf', True)
conf.set('spark.streaming.blockInterval', 1000*4)  # interval (ms) at which received data is chunked into blocks

Initialize the Spark Streaming context:

sc = SparkContext(conf = conf)
sc.setLogLevel('ERROR')
    
ssc = StreamingContext(sc, 30)  # batch interval: one micro-batch every 30 seconds

lines = ssc.socketTextStream(str(ip), int(port))  # connect to the Java socket listener
# lines.pprint()
lines.foreachRDD(requestLog)  # per-batch handler for request logs (see 2.6.11)
lines.foreachRDD(errorLog)    # per-batch handler for error logs
ssc.start()
ssc.awaitTermination()

Set the RDD checkpoint directory so data can be backed up to and recovered from HDFS:
sc.setCheckpointDir('hdfs://hadoop01:9000/hadoop/upload/checkpoint/')
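
Besides the RDD checkpoint directory, Spark Streaming keeps its own checkpoint for recovering streaming metadata after a driver restart; a minimal sketch, assuming the same HDFS path is reused:

# streaming-level checkpoint (DStream metadata and generated RDDs) for driver recovery
ssc.checkpoint('hdfs://hadoop01:9000/hadoop/upload/checkpoint/')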

  • 2.6.4 Singleton SparkSession
def getSparkSessionInstance(sparkConf):
    '''
    :@desc share one SparkSession globally across the RDD handlers
     .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/test.coll") \
     .config("spark.mongodb.output.uri", "mongodb://adxkj:123456@192.168.0.252:27017/") \
    :param sparkConf:
    :return:
    '''
    if ('sparkSessionSingletonInstance' not in globals()):
        globals()['sparkSessionSingletonInstance'] = SparkSession \
            .builder \
            .config(conf=sparkConf) \
            .getOrCreate()
    return globals()['sparkSessionSingletonInstance']
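
This follows the lazily-initialised singleton pattern recommended in the Spark Streaming programming guide: every micro-batch handler fetches the same SparkSession on the driver instead of rebuilding one per batch. A usage sketch (the handler name and the single-column schema are illustrative only):

from pyspark.sql import Row

def someBatchHandler(time, rdd):
    if rdd.isEmpty():
        return None
    # fetch the shared SparkSession instead of rebuilding one per batch
    spark = getSparkSessionInstance(rdd.context.getConf())
    df = spark.createDataFrame(rdd.map(lambda x: Row(line=x)))
    df.show()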
  • 2.6.5 Python processing utilities
def timeFomate(x):
    '''
    :@desc normalize the timestamp fields of a split log line
    :param x: log line already split on spaces
    :return: the list with x[0] replaced by a datetime object
    '''
    if not isinstance(x, list):
        return None

    # merge the date and time tokens into one, then drop the originals
    x.insert(0, ' '.join(x[0:2]))
    x.pop(1)
    x.pop(1)

    # strip '[', ']', quotes and commas from every token
    rx = re.compile(r'([\[\]\',])')
    # text = rx.sub(r'\\\1', text)
    x = [rx.sub(r'', x[i]) for i in range(len(x))]

    # drop the fractional seconds, then parse the string into a datetime
    x[0] = x[0][: x[0].find('.')]
    x[0] = ''.join(x[0])
    x[0] = datetime.strptime(x[0], '%Y-%m-%d %H:%M:%S')

    return x
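
Assuming a log line whose first two tokens are the date and the time with milliseconds, for example `[2019-02-13 10:15:30.123] INFO ...` split on spaces (the sample tokens below are hypothetical), timeFomate merges the two tokens, strips the bracket and quote characters, drops the fractional seconds and parses the result into a datetime at position 0:

sample = ['[2019-02-13', '10:15:30.123]', 'INFO', 'c.a.his.Controller']
out = timeFomate(list(sample))
# out[0] is datetime(2019, 2, 13, 10, 15, 30); the remaining tokens keep their order
print(out)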

def analyMod(x) :
    '''
    :@desc match the request URI against the module table
    :param x: split log line
    :return: the list with the module name appended
    '''
    if x[6].strip() == '':          # no URI, nothing to match
        return None

    hasMatch = False

    for k, v in URI_MODULES.items() :
        if x[6].strip().startswith('/' + k) :
            hasMatch = True
            x.append(v)

    if not hasMatch:
        x.append('公共模組')         # fallback: common/shared module

    return x
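
URI_MODULES is referenced here but not defined in this excerpt; it is expected to be a dict mapping URI prefixes to module names. A hypothetical example (the real table belongs to the HIS business system):

URI_MODULES = {
    'register': '掛號模組',    # registration
    'outpatient': '門診模組',  # outpatient clinic
    'pharmacy': '藥房模組',    # pharmacy
}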
    
# Python decodes the Chinese fields that were Base64-encoded on the Java side
def decodeStr(x) :
    '''
    :@desc Base64-decode the Chinese payload fields
    :param x: split log line
    :return:
    '''

    try:
        if x[9].strip() != '' :
            x[9] = base64.b64decode(x[9].encode("utf-8")).decode("utf-8")
            # x[9] = x[9][:5000] #mysql

        if x[11].strip() != '':
            x[11] = base64.b64decode(x[11].encode("utf-8")).decode("utf-8")
            # x[11] = x[11][:5000] #mysql

        if len(x) > 12 and x[12].strip() != '':
            x[12] = base64.b64decode(x[12].encode("utf-8")).decode("utf-8")

    except Exception as e:
        print("cannot decode:", x, e)

    return x
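
A quick round trip illustrates why the Java side Base64-encodes the Chinese payload: the encoded form is plain ASCII, so it survives the socket transfer and the ISO-8859-1 re-read in the listener untouched. This is a standalone sketch with a hypothetical payload, not part of the original code:

import base64

payload = '患者姓名:張三'  # a Chinese field as written by log4j (hypothetical content)
encoded = base64.b64encode(payload.encode('utf-8')).decode('ascii')  # what the Java side sends
decoded = base64.b64decode(encoded.encode('utf-8')).decode('utf-8')  # what decodeStr() does
assert decoded == payload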
  • 2.6.6 Storing SparkSQL results in MySQL
def sqlMysql(sqlResult, table, url="jdbc:mysql://192.168.0.252:3306/hisLog", user='root', password=""):
    '''
    :@desc persist the SQL result to MySQL via JDBC
    :param sqlResult:
    :param table:
    :param url:
    :param user:
    :param password:
    :return:
    '''
    try:
        sqlResult.write \
            .mode('append') \
            .format("jdbc") \
            .option("url", url) \
            .option("dbtable", table) \
            .option("user", user) \
            .option("password", password) \
            .save()
    except:
        excType, excValue, excTraceback = sys.exc_info()
        traceback.print_exception(excType, excValue, excTraceback, limit=3)
        # print(excValue)
        # traceback.print_tb(excTraceback)
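
For this JDBC write to succeed, the MySQL JDBC driver jar (e.g. mysql-connector-java) must also be on the Spark classpath. A usage sketch as called from the analysis flow; the extra URL parameters for UTF-8 handling are an assumption, not taken from the original code:

sqlMysql(sqlresult, REQUEST_TABLE,
         url="jdbc:mysql://192.168.0.252:3306/hisLog?useUnicode=true&characterEncoding=utf8",
         user='root', password='')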
  • 2.6.7 Storing SparkSQL results in MongoDB
    Note: put the MongoDB Java driver and the MongoDB Spark connector jars on the Spark classpath.
def sqlMongodb(sqlResult, table):
    '''
    :@desc persist the SQL result to MongoDB
    :param sqlResult:
    :param table: target collection name
    :return:
    '''
    try:
        sqlResult.\
            write.\
            format("com.mongodb.spark.sql.DefaultSource"). \
            options(uri="mongodb://adxkj:123456@192.168.0.252:27017/hislog",
                    database="hislog", collection=table, user="adxkj", password="123456").\
            mode("append").\
            save()
    except:
        excType, excValue, excTraceback = sys.exc_info()
        traceback.print_exception(excType, excValue, excTraceback, limit=3)
        # print(excValue)
        # traceback.print_tb(excTraceback)
  • 2.6.8 RDD transform->action
def fromRDD(rdd):
    # keep only INFO lines produced by the http-nio worker threads for GET/POST requests,
    # then normalize timestamps, decode the Base64 fields and tag the module
    reqrdd = rdd.map(lambda x: x.split(' ')).\
        filter(lambda x: len(x) > 12 and x[4].find('http-nio-') > 0 and x[2].strip() == 'INFO').\
        filter(lambda x: x[8].strip().upper().startswith('POST') or x[8].strip().upper().startswith('GET')).\
        map(timeFomate).\
        map(decodeStr).\
        map(analyMod)
    return reqrdd
  • 2.6.9 Converting the RDD to a DataFrame and running structured SparkSQL queries
def constructSQL(reqrdd):
    if reqrdd.isEmpty():
        return None

    # map the positional fields onto named columns
    sqlRdd = reqrdd.map(lambda x: Row(time=x[0], level=x[1], clz=x[2], thread=x[3], user=x[4], depart=x[5],
                        uri=x[6], method=x[7], ip=x[8], request=x[9], oplen=x[10],
                        respone=x[11], mod=x[12]))

    spark = getSparkSessionInstance(reqrdd.context.getConf())
    df = spark.createDataFrame(sqlRdd)
    df.createOrReplaceTempView(REQUEST_TABLE)

    # analyze after structuring
    sqlresult = spark.sql("SELECT * FROM " + REQUEST_TABLE)
    sqlresult.show()
    return sqlresult
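
With the request log registered as a temporary view, the same pattern supports aggregate analysis; for example, counting requests per module. This is an illustrative follow-up query, not part of the original code, and the stats collection name is a placeholder:

spark = getSparkSessionInstance(reqrdd.context.getConf())
stat = spark.sql("SELECT `mod`, COUNT(*) AS cnt FROM " + REQUEST_TABLE + " GROUP BY `mod` ORDER BY cnt DESC")
stat.show()
# the aggregate could be persisted the same way, e.g. sqlMongodb(stat, 'module_stats')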
  • 2.6.10 RDD persistence
# cache in memory
reqrdd.cache()

# persist the RDD to disk to reduce memory pressure; cache() only gives StorageLevel.MEMORY_ONLY
# reqrdd.persist(storageLevel=StorageLevel.MEMORY_AND_DISK_SER)

# set a checkpoint so the data can be recovered from HDFS
reqrdd.checkpoint()  # cache before checkpointing to avoid computing twice; the earlier lineage is discarded
  • 2.6.11 Log analysis flow
def requestLog(time, rdd):
    '''
    :@desc request-log analysis for one micro-batch
    :param time: batch time
    :param rdd: batch RDD delivered by foreachRDD
    :return:
    '''
    logging.info("+++++handle request log: length:%d++++++++++" % (rdd.count()))
    if rdd.isEmpty():
        return None
    logging.info("++++++++++++++++++++++processing requestLog+++++++++++++++++++++++++++++++")
    reqrdd = fromRDD(rdd)

    sqlresult = constructSQL(reqrdd)
    if sqlresult is None:
        return None

    # persist
    sqlMongodb(sqlresult, REQUEST_TABLE)
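
errorLog is registered in 2.6.3 but its body is not part of this excerpt; below is a hypothetical skeleton that follows the same per-batch structure (the actual filters and columns would differ):

def errorLog(time, rdd):
    '''
    :@desc error-log branch; hypothetical skeleton, the real implementation is not shown here
    '''
    if rdd.isEmpty():
        return None
    errrdd = rdd.map(lambda x: x.split(' ')) \
                .filter(lambda x: len(x) > 2 and x[2].strip() == 'ERROR')
    # structure and persist analogously to the request path, e.g.:
    # sqlresult = constructSQL(errrdd)   # would need an error-specific Row mapping
    # sqlMongodb(sqlresult, ERROR_TABLE)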

Note: for the complete code, please contact the author @狼.
