1.11.2 Flink SQL custom SLS connector

Published by 菜到摳腳的cxy on 2020-09-29

Background

We need to connect to Alibaba Cloud Log Service (SLS) and consume its logs.

Alibaba provides a matching flink-log-connector (see the Aliyun documentation).

However, it only exposes the Flink DataStream API, and we need Flink SQL, so we have to write a corresponding table source ourselves.

Note

flink-log-connector fetches multiple log entries per call (as a RawLogGroupList); to integrate it cleanly with Flink SQL you may also need to modify its source code.

Implementation

Dependency

The jar used here was built from a self-checked-out copy of the source:

<dependency>
    <groupId>com.aliyun.openservices</groupId>
    <artifactId>flink-log-connector</artifactId>
    <version>0.1.24-SNAPSHOT</version>
</dependency>
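
The SNAPSHOT version is not available from public repositories; roughly, it can be produced by building the checked-out source locally (a sketch, assuming a standard Maven setup; the actual branch and version may differ):

git clone https://github.com/aliyun/aliyun-log-flink-connector.git
cd aliyun-log-flink-connector
mvn clean install -DskipTests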

Format

First, write a format factory and decoding format that turn SLS log groups into rows.

SlsFormatFactory

public class SlsFormatFactory implements DecodingFormatFactory<LogDeserializationSchema<RowData>> {

    @Override
    public DecodingFormat<LogDeserializationSchema<RowData>> createDecodingFormat(DynamicTableFactory.Context context, ReadableConfig readableConfig) {
        // either implement your custom validation logic here ...
        // or use the provided helper method
        FactoryUtil.validateFactoryOptions(this, readableConfig);

        // create and return the format
        return new SlsFormat();
    }

    @Override
    public String factoryIdentifier() {
        return "sls";
    }

    @Override
    public Set<ConfigOption<?>> requiredOptions() {
        return Collections.emptySet();
    }

    @Override
    public Set<ConfigOption<?>> optionalOptions() {
        final Set<ConfigOption<?>> options = new HashSet<>();
        return options;
    }
}

SlsFormat

public class SlsFormat implements DecodingFormat<LogDeserializationSchema<RowData>> {

    @Override
    public LogDeserializationSchema<RowData> createRuntimeDecoder(DynamicTableSource.Context context, DataType dataType) {
        // create type information for the rows produced by the deserialization schema
        final TypeInformation<RowData> producedTypeInfo = (TypeInformation<RowData>) context.createTypeInformation(
                dataType);

        // most of the code in DeserializationSchema will not work on internal data structures
        // create a converter for conversion at the end
        final DynamicTableSource.DataStructureConverter converter = context.createDataStructureConverter(dataType);

        // use logical types during runtime for parsing
        final List<LogicalType> parsingTypes = dataType.getLogicalType().getChildren();

        // create runtime class
        return new SlsDeserializer(parsingTypes, converter, producedTypeInfo);
    }

    @Override
    public ChangelogMode getChangelogMode() {
        // this format produces insert-only rows
        return ChangelogMode.newBuilder()
                .addContainedKind(RowKind.INSERT)
                .build();
    }
}

SlsDeserializer

public class SlsDeserializer implements LogDeserializationSchema<RowData> {
    private final List<LogicalType> parsingTypes;
    private final DynamicTableSource.DataStructureConverter converter;
    private final TypeInformation<RowData> producedTypeInfo;

    public SlsDeserializer(List<LogicalType> parsingTypes, DynamicTableSource.DataStructureConverter converter, TypeInformation<RowData> producedTypeInfo) {
        this.parsingTypes = parsingTypes;
        this.converter = converter;
        this.producedTypeInfo = producedTypeInfo;
    }

    @Override
    public TypeInformation<RowData> getProducedType() {
        return producedTypeInfo;
    }

    @Override
    public RowData deserialize(List<LogGroupData> logGroups) {
        // parse the data received by the SLS consumer and turn the whole batch into a single RowData
        // whose only field is a JSON string of all log entries
        // emitRecordAndUpdateState in com.aliyun.openservices.log.flink.model.LogDataFetcher (in the
        // modified flink-log-connector, see below) then splits that RowData into multiple RowData records
        List<Map<String, String>> collect = logGroups.stream()
                .map(LogGroupData::GetFastLogGroup)
                .map(FastLogGroup::getLogs)
                .flatMap(Collection::stream)
                .map(fastLog -> {
                    int count = fastLog.getContentsCount();
                    Map<String, String> log = new HashMap<>();
                    for (int cIdx = 0; cIdx < count; ++cIdx) {
                        FastLogContent content = fastLog.getContents(cIdx);
                        log.put(content.getKey(), content.getValue());
                    }
                    return log;
                }).collect(Collectors.toList());
//        ArrayList<RawLog> rawLogs = new ArrayList<>();
//        for (LogGroupData logGroup : logGroups) {
//            FastLogGroup flg = logGroup.GetFastLogGroup();
//            for (int lIdx = 0; lIdx < flg.getLogsCount(); ++lIdx) {
//                FastLog log = flg.getLogs(lIdx);
//                RawLog rlog = new RawLog();
//                rlog.setTime(log.getTime());
//                for (int cIdx = 0; cIdx < log.getContentsCount(); ++cIdx) {
//                    FastLogContent content = log.getContents(cIdx);
//                    rlog.addContent(content.getKey(), content.getValue());
//                }
//                rawLogs.add(rlog);
//            }
//        }
        final RowKind kind = RowKind.INSERT;
        final Row row = new Row(kind, parsingTypes.size());
//        Row row = new Row(1);
        row.setField(0, JSONObject.toJSONString(collect));
        return (RowData) converter.toInternal(row);
    }
}

Flink SQL connector

SlsDynamicTableSourceFactory

public class SlsDynamicTableSourceFactory implements DynamicTableSourceFactory {

    public static final ConfigOption<String> PROJECT = ConfigOptions.key("project").stringType().noDefaultValue();
    public static final ConfigOption<String> ACCESS_ID = ConfigOptions.key("access.id").stringType().noDefaultValue();
    public static final ConfigOption<String> ACCESS_KEY = ConfigOptions.key("access.key").stringType().noDefaultValue();
    public static final ConfigOption<String> ENDPOINT = ConfigOptions.key("endpoint").stringType().noDefaultValue();
    public static final ConfigOption<String> LOGSTORE = ConfigOptions.key("logstore").stringType().noDefaultValue();
    public static final ConfigOption<String> CONSUMER_BEGINPOSITION = ConfigOptions.key("consumer.beginposition").stringType().noDefaultValue();
    public static final ConfigOption<String> FORMAT = ConfigOptions.key("format").stringType().noDefaultValue();

    public SlsDynamicTableSourceFactory() {}

    @Override
    public String factoryIdentifier() {
        return "sls";
    }

    @Override
    public Set<ConfigOption<?>> requiredOptions() {
        Set<ConfigOption<?>> options = new HashSet<>();
        options.add(PROJECT);
        options.add(ACCESS_ID);
        options.add(ACCESS_KEY);
        options.add(ENDPOINT);
        options.add(LOGSTORE);
        options.add(CONSUMER_BEGINPOSITION);
        options.add(FORMAT);
        return options;
    }

    @Override
    public Set<ConfigOption<?>> optionalOptions() {
        return new HashSet<>();
    }

    @Override
    public DynamicTableSource createDynamicTableSource(Context context) {
        // either implement your custom validation logic here ...
        // or use the provided helper utility
        final FactoryUtil.TableFactoryHelper helper = FactoryUtil.createTableFactoryHelper(this, context);

        // discover a suitable decoding format
        final DecodingFormat<LogDeserializationSchema<RowData>> decodingFormat = helper.discoverDecodingFormat(
                DecodingFormatFactory.class,
                FactoryUtil.FORMAT);

        // validate all options
        helper.validate();
        TableSchema schema = context.getCatalogTable().getSchema();
        // get the validated options
        final ReadableConfig options = helper.getOptions();
        String project = options.get(PROJECT);
        String accessId = options.get(ACCESS_ID);
        String accessKey = options.get(ACCESS_KEY);
        String endpoint = options.get(ENDPOINT);
        String logstore = options.get(LOGSTORE);
        String consumerBeginposition = options.get(CONSUMER_BEGINPOSITION);

        // derive the produced data type (excluding computed columns) from the catalog table
        final DataType producedDataType = schema.toPhysicalRowDataType();
        return new SlsDynamicTableSource(project, accessId, accessKey, endpoint, logstore, consumerBeginposition, decodingFormat, producedDataType, schema);
    }
}
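
For Flink's FactoryUtil to discover these classes at runtime, both SlsDynamicTableSourceFactory and SlsFormatFactory must be registered via Java SPI. A minimal sketch, assuming the classes live in a package such as com.example.sls (the post does not show package declarations): create src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory with the following content.

com.example.sls.SlsDynamicTableSourceFactory
com.example.sls.SlsFormatFactory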

SlsDynamicTableSource

public class SlsDynamicTableSource implements ScanTableSource {
    private String project;
    private String accessId;
    private String accessKey;
    private String endpoint;
    private String logstore;
    private String consumerBeginposition;
    private DecodingFormat<LogDeserializationSchema<RowData>> decodingFormat;
    private DataType producedDataType;
    private TableSchema schema;


    public SlsDynamicTableSource(String project, String accessId, String accessKey, String endpoint, String logstore, String consumerBeginposition,
                                 DecodingFormat<LogDeserializationSchema<RowData>> decodingFormat, DataType producedDataType,
                                 TableSchema schema
    ) {
        this.project = project;
        this.accessId = accessId;
        this.accessKey = accessKey;
        this.endpoint = endpoint;
        this.logstore = logstore;
        this.consumerBeginposition = consumerBeginposition;
        this.decodingFormat = decodingFormat;
        this.producedDataType = producedDataType;
        this.schema = schema;
    }

    @Override
    public ChangelogMode getChangelogMode() {
        return ChangelogMode.newBuilder()
                .addContainedKind(RowKind.INSERT)
                .build();
    }

    @Override
    public ScanRuntimeProvider getScanRuntimeProvider(ScanContext scanContext) {
        // create runtime classes that are shipped to the cluster

        final LogDeserializationSchema<RowData> deserializer = decodingFormat.createRuntimeDecoder(
                scanContext,
                producedDataType);

        // split the comma-separated logstore names
        List<String> topics = Arrays.asList(this.logstore.split(","));
        Properties slsProperties = new Properties();
        // endpoint of the Log Service project
        slsProperties.put(ConfigConstants.LOG_ENDPOINT, this.endpoint);
        // access credentials
        slsProperties.put(ConfigConstants.LOG_ACCESSSKEYID, this.accessId);
        slsProperties.put(ConfigConstants.LOG_ACCESSKEY, this.accessKey);
        // start position for consuming:
        // begin_cursor, end_cursor, a unix timestamp, or consumer_from_checkpoint
        slsProperties.put(ConfigConstants.LOG_CONSUMER_BEGIN_POSITION, this.consumerBeginposition);
//        // consumer group name
//        slsProperties.put(ConfigConstants.LOG_CONSUMERGROUP, "flink-consumer-test");
//        slsProperties.put(ConfigConstants.LOG_FETCH_DATA_INTERVAL_MILLIS, 3000);
//        slsProperties.put(ConfigConstants.LOG_MAX_NUMBER_PER_FETCH, 10);
//        /**
//         * DISABLED---Never commit checkpoints to the remote server.
//         * ON_CHECKPOINTS---Commit checkpoints only when Flink creates a checkpoint, which means Flink
//         *                  checkpointing must be enabled.
//         * PERIODIC---Auto-commit checkpoints periodically.
//         */
//        slsProperties.put(ConfigConstants.LOG_CHECKPOINT_MODE, CheckpointMode.ON_CHECKPOINTS.name());
//        /**
//         * Presumably, if ConfigConstants.LOG_CHECKPOINT_MODE is set to CheckpointMode.PERIODIC,
//         * the auto-commit interval can be configured as well:
//         * slsProperties.put(ConfigConstants.LOG_COMMIT_INTERVAL_MILLIS, "10000");
//         */

        FlinkLogConsumer<RowData> flinkLogConsumer = new FlinkLogConsumer<>(project, topics, (LogDeserializationSchema) deserializer, slsProperties);
        return SourceFunctionProvider.of(flinkLogConsumer, false);
    }

    @Override
    public DynamicTableSource copy() {
        // keep the decoding format so the copied source remains usable
        return new SlsDynamicTableSource(project, accessId, accessKey, endpoint, logstore, consumerBeginposition, decodingFormat, producedDataType, schema);
    }

    @Override
    public String asSummaryString() {
        return "sls Table Source";
    }

}

Flow

The data FlinkLogConsumer receives is a List<LogGroupData>. It is handed to the deserialize method of our SlsDeserializer, where the whole List<LogGroupData> is converted into a single RowData with one field whose content is a JSON string of all the log entries.
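
With the unmodified connector, that single field holds something like the following (keys and values are purely illustrative):

[{"request_uri":"/log.gif?aaaa=1&bbbb=2","platform":"ios"},{"request_uri":"/log.gif?aaaa=3&bbbb=4","platform":"android"}]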

At this point the source can already receive data. When the source table is created via DDL, the first field of each RowData is the JSON string of a whole batch of logs, so downstream you will most likely need a UDTF to parse each RowData.

For example, with SQL like the following:

CREATE FUNCTION ParseUriRow AS 'flinksql.function.udtf.ParseUriRow';
CREATE TABLE sourceTable (
    request_uri STRING
) WITH (
    'connector' = 'sls',
    'endpoint' = '',
    'project' = '',
    'access.id' = '',
    'access.key' = '',
    'logstore' = '',
    'consumer.beginposition' = '1585670400',
    'format' = 'sls'
);
CREATE TABLE sinktable (
    platform STRING,
    aaaa STRING,
    bbbb STRING
) WITH (
    'connector' = 'print'
);
insert into sinktable
 select
    platform,
    aaaa,
    bbbb
 from sourceTable, LATERAL TABLE(ParseUriRow(request_uri)) as T(
        platform,
        aaaa,
        bbbb
     )
 where 1 = POSITION('/log.gif?' IN request_uri);
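
ParseUriRow itself is not shown in the post. A minimal sketch of what such a UDTF could look like, assuming it simply re-parses the JSON array produced by SlsDeserializer and pulls out a few fields (the class body, field names, and parsing logic are hypothetical):

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.apache.flink.table.annotation.DataTypeHint;
import org.apache.flink.table.annotation.FunctionHint;
import org.apache.flink.table.functions.TableFunction;
import org.apache.flink.types.Row;

// hypothetical UDTF: splits the JSON array of log entries into one row per entry
@FunctionHint(output = @DataTypeHint("ROW<platform STRING, aaaa STRING, bbbb STRING>"))
public class ParseUriRow extends TableFunction<Row> {

    public void eval(String logsJson) {
        if (logsJson == null) {
            return;
        }
        JSONArray logs = JSONObject.parseArray(logsJson);
        for (int i = 0; i < logs.size(); i++) {
            JSONObject log = logs.getJSONObject(i);
            // emit one row per log entry; missing keys simply become NULL
            collect(Row.of(log.getString("platform"), log.getString("aaaa"), log.getString("bbbb")));
        }
    }
}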

With this approach, though, you have to write a UDTF every time.

So I checked out Aliyun's flink-log-connector source and modified it slightly:

https://github.com/aliyun/aliyun-log-flink-connector.git

The change is in emitRecordAndUpdateState of com.aliyun.openservices.log.flink.model.LogDataFetcher.

This method now parses the JSON string and emits multiple RowData records:

void emitRecordAndUpdateState(T record, long recordTimestamp, int shardStateIndex, String cursor) {
        synchronized (checkpointLock) {
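            // record is the single-field row produced by SlsDeserializer; instead of emitting it directly
            // (the commented-out line below), parse its JSON payload and emit one row per log entry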
//            sourceContext.collectWithTimestamp(record, recordTimestamp);
            GenericRowData genericRowData = (GenericRowData)record;
            BinaryStringData binaryStringData = (BinaryStringData)genericRowData.getString(0);
            String str = binaryStringData.getJavaObject();
            JSONArray objects = JSONObject.parseArray(str);
            for (int i = 0; i < objects.size(); i++) {
                JSONObject jsonObject = objects.getJSONObject(i);
                RowDataTypeInfo rowTypeInfo = (RowDataTypeInfo)deserializer.getProducedType();
                String[] fieldNames = rowTypeInfo.getFieldNames();
                int fsize = fieldNames.length;
                GenericRowData oneRow = new GenericRowData(fsize);
                for (int j = 0; j < fsize; j++) {
                    oneRow.setField(j,new BinaryStringData((String) jsonObject.get(fieldNames[j])));
                }
                sourceContext.collectWithTimestamp((T)oneRow, recordTimestamp);
            }
            LogstoreShardState state = subscribedShardsState.get(shardStateIndex);
            state.setOffset(cursor);
            if (state.hasMoreData()) {
                return;
            }
            if (this.numberOfActiveShards.decrementAndGet() == 0) {
                LOG.info("Subtask {} has reached the end of all currently subscribed shards; marking the subtask as temporarily idle ...",
                        indexOfThisSubtask);
                sourceContext.markAsTemporarilyIdle();
            }
        }
    }

The final usage then looks roughly like this:

CREATE TABLE sourceTable (
    request_uri STRING,
    platform STRING,
    aaaa STRING,
    bbbb STRING
) WITH (
    'connector' = 'sls',
    'endpoint' = '',
    'project' = '',
    'access.id' = '',
    'access.key' = '',
    'logstore' = '',
    'consumer.beginposition' = '1585670400',
    'format' = 'sls'
);
CREATE TABLE sinktable (
    platform STRING,
    aaaa STRING,
    bbbb STRING
) WITH (
    'connector' = 'print'
);
insert into sinktable
 select
    platform,
    aaaa,
    bbbb
 from sourceTable
 where 1 = POSITION('/log.gif?' IN request_uri);
 

 
