Flink version: 1.14.6 — watermark generation and watermark-triggered window computation

Posted by A1340399426 on 2024-06-20

Flink generates watermarks either punctuated (per record) or periodically.
1. Definition and purpose
* punctuated: tries to generate a watermark for every single record. This gives finer-grained control, but adds unnecessary computational overhead.
* periodic: generates watermarks at a fixed interval, configurable via env.getConfig().setAutoWatermarkInterval(1 * 1000L); the default is 200 ms. This is more efficient and is sufficient for most scenarios.
2. Community trend
* punctuated: the old punctuated assigner API is deprecated.
* periodic: generally recommended as the way to generate watermarks (both modes are sketched below).
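For reference, here is a minimal sketch of the two modes against the Flink 1.14 WatermarkGenerator interface. The class and generator names are illustrative, not part of the original code; a periodic generator only emits in onPeriodicEmit (driven by autoWatermarkInterval), while a punctuated one emits directly in onEvent.

import org.apache.flink.api.common.eventtime.Watermark;
import org.apache.flink.api.common.eventtime.WatermarkGenerator;
import org.apache.flink.api.common.eventtime.WatermarkOutput;

public class WatermarkModes {

    // periodic: remember the max timestamp per event; emit only in onPeriodicEmit,
    // which the runtime calls every autoWatermarkInterval (default 200 ms)
    public static class PeriodicGenerator implements WatermarkGenerator<String> {
        // +1 so that maxTimestamp - 1 cannot underflow before the first event arrives
        private long maxTimestamp = Long.MIN_VALUE + 1;

        @Override
        public void onEvent(String event, long eventTimestamp, WatermarkOutput output) {
            maxTimestamp = Math.max(maxTimestamp, eventTimestamp);
        }

        @Override
        public void onPeriodicEmit(WatermarkOutput output) {
            output.emitWatermark(new Watermark(maxTimestamp - 1));
        }
    }

    // punctuated: emit a watermark for every record in onEvent
    public static class PunctuatedGenerator implements WatermarkGenerator<String> {
        @Override
        public void onEvent(String event, long eventTimestamp, WatermarkOutput output) {
            output.emitWatermark(new Watermark(eventTimestamp - 1));
        }

        @Override
        public void onPeriodicEmit(WatermarkOutput output) {
            // intentionally empty: the periodic callback emits nothing
        }
    }

    // usage: WatermarkStrategy.<String>forGenerator(ctx -> new PeriodicGenerator())
}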

Full example code:
package flink.shangguigu_test;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.restartstrategy.RestartStrategies;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.runtime.state.hashmap.HashMapStateBackend;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.triggers.EventTimeTrigger;
import org.apache.flink.util.Collector;

import java.time.Duration;

public class Tublingwindow_pv {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        env.getConfig().setAutoWatermarkInterval(1L); // emit a watermark every 1 ms (default is 200 ms); set this low so that effectively every record advances the watermark, which makes the printed results easier to analyze


        env.enableCheckpointing(5 * 1000, CheckpointingMode.EXACTLY_ONCE);
        // fixedDelayRestart(restartAttempts, delayBetweenAttemptsMs): at most 3 restarts, 5 seconds apart
        env.setRestartStrategy(RestartStrategies.fixedDelayRestart(3, 5 * 1000));
        env.setStateBackend(new HashMapStateBackend());
        env.getCheckpointConfig().setMaxConcurrentCheckpoints(1);
        env.getCheckpointConfig().setCheckpointStorage("hdfs://hadoop-ha/user/flink/cluster_yarn/checkpoints");

        // read lines from a local socket and drop empty ones
        DataStream<String> input = env.socketTextStream("127.0.0.1", 8085).filter(new FilterFunction<String>() {
            @Override
            public boolean filter(String value) throws Exception {
                // compare strings with isEmpty()/equals(); != only compares references
                return value != null && !value.isEmpty();
            }
        });

        SingleOutputStreamOperator<String> assign_event_time = input
                .assignTimestampsAndWatermarks(
                        WatermarkStrategy
                                .<String>forBoundedOutOfOrderness(Duration.ofSeconds(2))
                                .withTimestampAssigner(new SerializableTimestampAssigner<String>() {
                                    @Override
                                    public long extractTimestamp(String element, long recordTimestamp) {
                                        JSONObject jsonObject = JSON.parseObject(element);
                                        Long operate_time = jsonObject.getLong("operate_time");
                                        // operate_time is in seconds; Flink works in milliseconds
                                        return operate_time * 1000L;
                                    }
                                    }
                                })
                );


        SingleOutputStreamOperator<Tuple2<String, Long>> process = assign_event_time.process(new ProcessFunction<String, Tuple2<String, Long>>() {
            @Override
            public void processElement(String value, Context ctx, Collector<Tuple2<String, Long>> out) throws Exception {
                // currentWatermark() returns the watermark that was emitted BEFORE this record was processed
                long watermark = ctx.timerService().currentWatermark();
                // the watermark this record will lead to: max(eventTime - outOfOrderness - 1, previous watermark)
                long newWatermark = Math.max(ctx.timestamp() - 2000 - 1, watermark);
                out.collect(Tuple2.of("timestamp:" + JSON.parseObject(value).getLong("operate_time") +
                        " event time:" + ctx.timestamp() +
                        " previous watermark:" + watermark +
                        " current watermark:" + newWatermark, 1L));
            }
        });


        // 2-second tumbling event-time windows; EventTimeTrigger is already the default trigger for event-time windows, so .trigger() here is explicit but redundant
        SingleOutputStreamOperator<String> aggregate = process
                .windowAll(TumblingEventTimeWindows.of(Time.seconds(2)))
                .trigger(EventTimeTrigger.create())
                .aggregate(new AggregateFunction<Tuple2<String, Long>, Tuple2<StringBuilder, Long>, String>() {
            @Override
            public Tuple2<StringBuilder, Long> createAccumulator() {
                return Tuple2.of(new StringBuilder(),0L);
            }

            @Override
            public Tuple2<StringBuilder, Long> add(Tuple2<String, Long> value, Tuple2<StringBuilder, Long> accumulator) {
                return Tuple2.of(accumulator.f0.append(value.f0).append("-->\n"),value.f1 + accumulator.f1);
            }

            @Override
            public String getResult(Tuple2<StringBuilder, Long> accumulator) {
                return accumulator.f0 + "==" + accumulator.f1;
            }

            @Override
            public Tuple2<StringBuilder, Long> merge(Tuple2<StringBuilder, Long> a, Tuple2<StringBuilder, Long> b) {
                // merge() is only invoked for merging window assigners (e.g. session windows); it is never called for tumbling windows
                return null;
            }
        });

        aggregate.print();
        env.execute();
    }
}
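To try the job locally, something must be listening on the socket before the job starts; for example, running `nc -lk 8085` (netcat) and pasting the sample lines below works, though any tool that serves plain text on 127.0.0.1:8085 will do.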

Sample data

{"UUID":"","operate_time":"1718718279"}

{"UUID":"","operate_time":"1718718279"}

{"UUID":"","operate_time":"1718718280"}

{"UUID":"","operate_time":"1718718280"}

{"UUID":"","operate_time":"1718718281"}

{"UUID":"","operate_time":"1718718285"}

{"UUID":"","operate_time":"1718718280"}

{"UUID":"","operate_time":"1718718279"}

Output after running:

timestamp:1718718260 event time:1718718260000 previous watermark:-9223372036854775808 current watermark:1718718257999-->
==1
timestamp:1718718279 event time:1718718279000 previous watermark:1718718257999 current watermark:1718718276999-->
timestamp:1718718279 event time:1718718279000 previous watermark:1718718276999 current watermark:1718718276999-->
==2
timestamp:1718718280 event time:1718718280000 previous watermark:1718718276999 current watermark:1718718277999-->
timestamp:1718718280 event time:1718718280000 previous watermark:1718718277999 current watermark:1718718277999-->
timestamp:1718718281 event time:1718718281000 previous watermark:1718718277999 current watermark:1718718278999-->
==3
(timestamp:1718718285 event time:1718718285000 previous watermark:1718718278999 current watermark:1718718282999, 1)

Conclusions:
Flink's initial watermark is Long.MIN_VALUE = -9223372036854775808, and the automatically generated watermarks that follow must be greater than 0. For example, if the first record's event_time produces a computed watermark of -2001, that value is not retained: the previous watermark printed for the second record is still -9223372036854775808.
Flink works in milliseconds throughout; when extracting event_time from the data, convert it to milliseconds if it is in seconds.
The watermark is computed as: current maximum timestamp - allowed out-of-orderness - 1 (all in milliseconds).
In the code above, watermarks are assigned with forBoundedOutOfOrderness, which generates watermarks periodically; the generation interval can be set via env.getConfig().setAutoWatermarkInterval(200L) and defaults to 200 ms.
Watermarks flow through Flink as separate stream elements. ctx.timerService().currentWatermark() prints the previous watermark value, so we compute the latest watermark ourselves from the latest event_time. Watermarks advance monotonically: if the newly computed watermark is smaller than the current one it is discarded; otherwise the logical clock's watermark is updated.
Whether a window closes and fires is decided by the latest watermark: a window fires once the watermark reaches the window's end (watermark >= window end - 1 ms). In the output above, the 1718718285 record pushes the watermark to 1718718282999, which is enough to close both the window [1718718278000, 1718718280000) holding the two 1718718279 records (the "==2" output) and the window [1718718280000, 1718718282000) holding the 1718718280/1718718281 records (the "==3" output); this arithmetic is worked through in the sketch below.
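As a sanity check on that last point, here is a minimal plain-Java sketch (the class name WindowFireCheck is illustrative) reproducing the watermark and window-end arithmetic for the 1718718285 record from the sample run:

public class WindowFireCheck {
    public static void main(String[] args) {
        long eventTimeMs = 1718718285000L;   // the record that triggered the "==3" output
        long outOfOrdernessMs = 2000L;       // forBoundedOutOfOrderness(Duration.ofSeconds(2))
        long windowSizeMs = 2000L;           // TumblingEventTimeWindows.of(Time.seconds(2))

        // watermark = current max timestamp - allowed out-of-orderness - 1
        long watermark = eventTimeMs - outOfOrdernessMs - 1;     // 1718718282999

        // the tumbling window containing the 1718718280/1718718281 records: [start, end)
        long recordTs = 1718718280000L;
        long windowStart = recordTs - (recordTs % windowSizeMs); // 1718718280000
        long windowEnd = windowStart + windowSizeMs;             // 1718718282000

        // an event-time window fires once watermark >= windowEnd - 1
        System.out.println("window [" + windowStart + ", " + windowEnd + ") fires: "
                + (watermark >= windowEnd - 1));                 // prints true
    }
}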
