作者:石蓓蓓
愛可生研發工程師,主要負責愛可生產品雲DMP樹產品的研發工作。
本文來源:原創投稿
*愛可生開源社群出品,原創內容未經授權不得隨意使用,轉載請聯絡小編並註明來源。
AlertManager 用來處理由客戶端應用程式(比如 Prometheus 的服務端)傳送過來的告警。對於輸入的告警,會經過分組、抑制、靜默、去重等步驟,最終將告警傳送給接收者(郵箱等)。
AlertManager 的框架圖如下:
今天主要是分享 AlertManager 中 Pipeline 相關的流程程式碼,pipeline 主要是用來處理分組後的告警,經過抑制、靜默、去重,然後傳送。
首先在建立 Pipeline 的時候,會建立 GossipSettleStage 、MuteStage(包含抑制和靜默)、WaitStage 、DedupStage 、RetryStage 、SetNotifiesStage 。
// New builds one notification pipeline per configured receiver and returns
// them as a RoutingStage keyed by receiver name.
//
// Every receiver's pipeline is a MultiStage made of the gossip-settle stage,
// the mute stage built from the inhibitor, the mute stage built from the
// silencer, and the receiver-specific stage produced by createReceiverStage,
// in that order. The first three stages are created once and shared by all
// receivers.
func (pb *PipelineBuilder) New(
	receivers map[string][]Integration,
	wait func() time.Duration,
	inhibitor *inhibit.Inhibitor,
	silencer *silence.Silencer,
	notificationLog NotificationLog,
	peer *cluster.Peer,
) RoutingStage {
	routes := make(RoutingStage, len(receivers))

	var (
		settleStage  = NewGossipSettleStage(peer)
		inhibitStage = NewMuteStage(inhibitor)
		silenceStage = NewMuteStage(silencer)
	)

	for name, integrations := range receivers {
		routes[name] = MultiStage{
			settleStage,
			inhibitStage,
			silenceStage,
			createReceiverStage(name, integrations, wait, notificationLog, pb.metrics),
		}
	}
	return routes
}
// createReceiverStage assembles the stage for a single receiver.
//
// For each integration it builds a MultiStage consisting of a wait stage, a
// dedup stage, a retry stage and a set-notifies stage (in that order) and
// collects these per-integration pipelines into a FanoutStage.
func createReceiverStage(
	name string,
	integrations []Integration,
	wait func() time.Duration,
	notificationLog NotificationLog,
	metrics *metrics,
) Stage {
	var fanout FanoutStage
	for idx := range integrations {
		// Point at the slice element itself: the dedup stage receives a
		// pointer into the integrations slice, not a loop-local copy.
		integration := &integrations[idx]

		recv := &nflogpb.Receiver{
			GroupName:   name,
			Integration: integration.Name(),
			Idx:         uint32(integration.Index()),
		}

		fanout = append(fanout, MultiStage{
			NewWaitStage(wait),
			NewDedupStage(integration, notificationLog, recv),
			NewRetryStage(*integration, name, metrics),
			NewSetNotifiesStage(notificationLog, recv),
		})
	}
	return fanout
}
從上面的程式碼可以看到,AlertManager 在處理某一通道時會經過 GossipSettleStage、MuteStage(抑制)、MuteStage(靜默)、WaitStage、DedupStage、RetryStage、SetNotifiesStage 這 7 個 stage(MuteStage 會為抑制和靜默各建立一個),並且順序執行。
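Pipeline 的執行是遍歷所有的 stage,每次呼叫該 stage 的 Exec 方法(見下方程式碼中的 ctx, alerts, err = s.Exec(ctx, l, alerts...) 這一行)。每次執行後返回的 alert 列表會作為下一個 stage 的引數(這行程式碼把返回的告警重新賦值給傳入的引數 alerts,下次執行 Exec 時傳入的 alerts 就是新的值),因此最終得到的 alert 列表是經過每個 stage 過濾後的告警列表。如果在執行某個 stage 之前已經沒有剩餘的告警,或某個 stage 返回了錯誤,Pipeline 會提前結束。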
// Exec implements the Stage interface by running the contained stages one
// after another.
//
// The context and alerts returned by one stage are fed into the next one.
// Execution stops early with a nil alert slice when no alerts are left before
// a stage runs, or when a stage returns an error (which is passed on).
func (ms MultiStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
	for idx := range ms {
		// Nothing left to process: skip the remaining stages.
		if len(alerts) == 0 {
			return ctx, nil, nil
		}

		var stageErr error
		ctx, alerts, stageErr = ms[idx].Exec(ctx, l, alerts...)
		if stageErr != nil {
			return ctx, nil, stageErr
		}
	}
	return ctx, alerts, nil
}
GossipSettle
等待叢集準備完畢。
// Exec implements the Stage interface. When a cluster peer is configured it
// calls WaitReady on that peer before handing the alerts through unchanged;
// without a peer it is a pass-through.
func (n *GossipSettleStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
	if peer := n.peer; peer != nil {
		peer.WaitReady()
	}
	return ctx, alerts, nil
}
Inhibitor 抑制
抑制首先會執行 MuteStage 的 Exec,在匹配到抑制規則後,就不會傳送該告警。主要是透過第6行的 n.muter.Mutes 方法來進行匹配:
// Exec implements the Stage interface. It asks the stage's muter about every
// alert's label set: muted alerts are handed to postMuteHandle and dropped,
// all other alerts are returned in their original order.
func (n *MuteStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
	var unmuted []*types.Alert
	for _, alert := range alerts {
		// TODO(fabxc): increment total alerts counter.
		if n.muter.Mutes(alert.Labels) {
			// Do not send the alert if muted.
			// TODO(fabxc): increment muted alerts counter if muted.
			n.postMuteHandle(alert)
			continue
		}
		unmuted = append(unmuted, alert)
	}
	return ctx, unmuted, nil
}
抑制條件是如何匹配的呢?
我們在設定抑制規則時,會設定抑制源和抑制目標。在啟動 Inhibitor 的時候,會先匹配抑制源(也就是 Source),如果某條告警的 label 滿足抑制源的條件,則會被放入 scache 中(透過 r.SourceExpMatcher.Match 或 r.SourceMatchers.Match 進行匹配,匹配成功後呼叫 r.scache.Set 寫入 scache 中)。
// run subscribes to the alert stream and keeps every inhibition rule's source
// cache up to date until ctx is cancelled.
//
// For each received alert, every rule checks whether the alert's labels match
// the rule's source side (through the expression matcher when
// IsExpressionMatch is set, through SourceMatchers otherwise). Matching alerts
// are stored in the rule's scache. Errors are logged and never stop the loop.
func (ih *Inhibitor) run(ctx context.Context) {
	sub := ih.alerts.Subscribe()
	defer sub.Close()

	for {
		select {
		case <-ctx.Done():
			return
		case alert := <-sub.Next():
			if err := sub.Err(); err != nil {
				level.Error(ih.logger).Log("msg", "Error iterating alerts", "err", err)
				continue
			}

			// Update the inhibition rules' cache.
			for _, rule := range ih.rules {
				var isSource bool
				if rule.IsExpressionMatch {
					matched, err := rule.SourceExpMatcher.Match(alert.Labels)
					if err != nil {
						level.Error(ih.logger).Log("msg", "Error expression match alerts", "err", err)
						continue
					}
					isSource = matched
				} else {
					isSource = rule.SourceMatchers.Match(alert.Labels)
				}

				if !isSource {
					continue
				}
				if err := rule.scache.Set(alert); err != nil {
					level.Error(ih.logger).Log("msg", "error on set alert", "err", err)
				}
			}
		}
	}
}
此時如果有新產生的告警正好滿足抑制規則的抑制目標(也就是 target)條件,那麼這條告警會透過方法 SetInhibited 被標記為抑制。在被標記為抑制時,也會同時記錄抑制源告警的指紋。
// Mutes reports whether the given label set is inhibited by any rule. It
// implements the Muter interface.
//
// A rule is considered only when its target side matches lset. For such a
// rule, hasEqual is asked whether a source alert inhibits the label set; on
// the first hit the label set's fingerprint is marked as inhibited by that
// source alert's fingerprint and true is returned. When no rule applies, the
// inhibition marker for the fingerprint is set without any source and false
// is returned. Rules whose expression matchers return an error are logged and
// skipped.
func (ih *Inhibitor) Mutes(lset model.LabelSet) bool {
	fp := lset.Fingerprint()

	for _, rule := range ih.rules {
		var sourceMatched bool

		if rule.IsExpressionMatch {
			targetMatched, err := rule.TargetExpMatcher.Match(lset)
			if err != nil {
				level.Error(ih.logger).Log("msg", "Error inhibitor expression match alerts", "err", err)
				continue
			}
			if !targetMatched {
				continue
			}

			// The target side matches. If the source side matches, too, we
			// need to exclude inhibiting alerts for which the same is true.
			sourceMatched, err = rule.SourceExpMatcher.Match(lset)
			if err != nil {
				level.Error(ih.logger).Log("msg", "Error inhibitor expression match alerts", "err", err)
				continue
			}
		} else {
			// If target side of rule doesn't match, we don't need to look
			// any further.
			if !rule.TargetMatchers.Match(lset) {
				continue
			}

			// The target side matches. If the source side matches, too, we
			// need to exclude inhibiting alerts for which the same is true.
			sourceMatched = rule.SourceMatchers.Match(lset)
		}

		inhibitedByFP, eq := rule.hasEqual(ih.logger, lset, sourceMatched)
		if !eq {
			continue
		}
		ih.marker.SetInhibited(fp, inhibitedByFP.String())
		return true
	}

	ih.marker.SetInhibited(fp)
	return false
}
Silencer 靜默
靜默規則同樣是執行 MuteStage 的 Exec,新的告警的 labels 匹配到靜默規則的條件後,新的告警就會被靜默,透過 SetSilenced 進行標記,同時會記錄匹配到的靜默規則的 ID。
// Mutes implements the Muter interface. It reports whether at least one
// active silence applies to the given label set and keeps the marker entry for
// the label set's fingerprint in sync with the result.
//
// The marker remembers which silence IDs applied last time and at which
// silences version; that is used to avoid a full query when nothing was added.
// A failing query is only logged; its (possibly empty) result is still used.
func (s *Silencer) Mutes(lset model.LabelSet) bool {
	fp := lset.Fingerprint()
	ids, markerVersion, _ := s.marker.Silenced(fp)

	newVersion := markerVersion
	var (
		active []*pb.Silence
		err    error
	)

	switch {
	case markerVersion != s.silences.Version():
		// New silences have been added, do a full query.
		active, newVersion, err = s.silences.Query(
			QState(types.SilenceStateActive),
			QMatches(lset),
		)
	case len(ids) == 0:
		// Super fast path: No silences ever applied to this alert, none
		// have been added. We are done.
		return false
	default:
		// This is still a quite fast path: No silences have been added, we
		// only need to check which of the applicable silences are
		// currently active. Note that newVersion is left at markerVersion
		// because the Query call might already return a newer version,
		// which is not the version our old list of applicable silences is
		// based on.
		active, _, err = s.silences.Query(
			QIDs(ids...),
			QState(types.SilenceStateActive),
		)
	}
	if err != nil {
		level.Error(s.logger).Log("msg", "Querying silences failed, alerts might not get silenced correctly", "err", err)
	}

	if len(active) == 0 {
		s.marker.SetSilenced(fp, newVersion)
		return false
	}

	// Same length does not imply same content, so compare element by element
	// until the first difference.
	idsChanged := len(active) != len(ids)
	for i := 0; !idsChanged && i < len(active); i++ {
		idsChanged = ids[i] != active[i].Id
	}

	if idsChanged {
		// Need to recreate ids.
		ids = make([]string, 0, len(active))
		for _, sil := range active {
			ids = append(ids, sil.Id)
		}
		sort.Strings(ids) // For comparability.
	}

	// Update marker only if something changed.
	if idsChanged || newVersion != markerVersion {
		s.marker.SetSilenced(fp, newVersion, ids...)
	}
	return true
}
WaitStage
WaitStage 表示向其他例項傳送 Notification Log 的時間間隔,只是單純的時間等待。
// Exec implements the Stage interface. It blocks for the duration returned by
// the stage's wait function and then passes the alerts through unchanged.
//
// If ctx is done before the wait elapses, Exec returns a nil alert slice and
// ctx.Err().
func (ws *WaitStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
	// Use an explicit timer instead of time.After so that it can be stopped
	// when the context finishes first; otherwise the pending timer stays
	// around until it fires.
	timer := time.NewTimer(ws.wait())
	defer timer.Stop()

	select {
	case <-timer.C:
		return ctx, alerts, nil
	case <-ctx.Done():
		return ctx, nil, ctx.Err()
	}
}
DedupStage
DedupStage 主要是透過計算告警的hash值來起到去重的作用。
// Exec implements the Stage interface. It decides whether a notification for
// the current alert group still has to be sent to this stage's receiver.
//
// The alerts are hashed and split into firing and resolved ones; both hash
// lists are stored in the returned context. The notification log is then
// queried for the group key and receiver, and needsUpdate compares the found
// entry (nil when there is none) with the current hashes and the repeat
// interval. The alerts are returned unchanged when an update is needed,
// otherwise a nil slice is returned.
//
// An error is returned when the group key or repeat interval is missing from
// ctx, when the log query fails with anything other than nflog.ErrNotFound, or
// when the query yields more than one entry.
func (n *DedupStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
	gkey, ok := GroupKey(ctx)
	if !ok {
		return ctx, nil, fmt.Errorf("group key missing")
	}
	repeatInterval, ok := RepeatInterval(ctx)
	if !ok {
		return ctx, nil, fmt.Errorf("repeat interval missing")
	}

	var (
		firing      = []uint64{}
		resolved    = []uint64{}
		firingSet   = make(map[uint64]struct{})
		resolvedSet = make(map[uint64]struct{})
	)
	for _, alert := range alerts {
		sum := n.hash(alert)
		if !alert.Resolved() {
			firing = append(firing, sum)
			firingSet[sum] = struct{}{}
			continue
		}
		resolved = append(resolved, sum)
		resolvedSet[sum] = struct{}{}
	}

	ctx = WithFiringAlerts(ctx, firing)
	ctx = WithResolvedAlerts(ctx, resolved)

	entries, err := n.nflog.Query(nflog.QGroupKey(gkey), nflog.QReceiver(n.recv))
	if err != nil && err != nflog.ErrNotFound {
		return ctx, nil, err
	}

	// At most one log entry is expected; entry stays nil when none exists.
	var entry *nflogpb.Entry
	if count := len(entries); count > 1 {
		return ctx, nil, fmt.Errorf("unexpected entry result size %d", count)
	} else if count == 1 {
		entry = entries[0]
	}

	if !n.needsUpdate(entry, firingSet, resolvedSet, repeatInterval) {
		return ctx, nil, nil
	}
	return ctx, alerts, nil
}
RetryStage
主要是根據不同的通道來傳送告警,如果失敗,會進行重試。
// Exec implements the Stage interface. It sends the alerts through the
// stage's integration and retries with exponential backoff until the
// notification succeeds, the integration reports an unrecoverable error, or
// ctx is done.
//
// When the integration does not send resolved alerts, only non-resolved
// alerts are passed to Notify; if there are no firing alerts at all, nothing
// is sent and the alerts are returned as if notified. On success, and on an
// unrecoverable error, the original alerts slice is returned. When ctx is done
// the last error seen from the integration is returned if there is one,
// otherwise ctx.Err().
func (r RetryStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
	toSend := alerts

	// If we shouldn't send notifications for resolved alerts, but there are only
	// resolved alerts, report them all as successfully notified (we still want the
	// notification log to log them for the next run of DedupStage).
	if !r.integration.SendResolved() {
		firing, ok := FiringAlerts(ctx)
		if !ok {
			return ctx, nil, fmt.Errorf("firing alerts missing")
		}
		if len(firing) == 0 {
			return ctx, alerts, nil
		}

		toSend = nil
		for _, alert := range alerts {
			if alert.Status() == model.AlertResolved {
				continue
			}
			toSend = append(toSend, alert)
		}
	}

	expBackoff := backoff.NewExponentialBackOff()
	ticker := backoff.NewTicker(expBackoff)
	defer ticker.Stop()

	// lastErr holds the most recent retryable error from the integration so
	// that it can be reported once the context is done.
	var lastErr error
	onDone := func() (context.Context, []*types.Alert, error) {
		if lastErr != nil {
			return ctx, nil, lastErr
		}
		return ctx, nil, ctx.Err()
	}

	for attempt := 1; ; attempt++ {
		// Always check the context first to not notify again.
		select {
		case <-ctx.Done():
			return onDone()
		default:
		}

		select {
		case <-ctx.Done():
			return onDone()
		case <-ticker.C:
			start := time.Now()
			retry, err := r.integration.Notify(ctx, toSend...)

			name := r.integration.Name()
			r.metrics.notificationLatencySeconds.WithLabelValues(name).Observe(time.Since(start).Seconds())
			r.metrics.numNotifications.WithLabelValues(name).Inc()

			if err == nil {
				return ctx, alerts, nil
			}

			r.metrics.numFailedNotifications.WithLabelValues(name).Inc()
			level.Debug(l).Log("msg", "Notify attempt failed", "attempt", attempt, "integration", name, "receiver", r.groupName, "err", err)
			if !retry {
				return ctx, alerts, fmt.Errorf("cancelling notify retry for %q due to unrecoverable error: %s", name, err)
			}

			// Save this error to be able to return the last seen error by an
			// integration upon context timeout.
			lastErr = err
		}
	}
}
SetNotifiesStage
SetNotifiesStage 主要是用來確保告警已經傳送給了通道,並記錄到 AlertManager 的通知日誌(Notification Log)中。
// Exec implements the Stage interface. It reads the group key and the firing
// and resolved alert lists from ctx and records them in the notification log
// for this stage's receiver.
//
// The alerts are returned unchanged together with the result of the log call.
// If any of the three values is missing from ctx, a nil alert slice and an
// error are returned and nothing is logged.
func (n SetNotifiesStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
	gkey, haveKey := GroupKey(ctx)
	if !haveKey {
		return ctx, nil, fmt.Errorf("group key missing")
	}

	firing, haveFiring := FiringAlerts(ctx)
	if !haveFiring {
		return ctx, nil, fmt.Errorf("firing alerts missing")
	}

	resolved, haveResolved := ResolvedAlerts(ctx)
	if !haveResolved {
		return ctx, nil, fmt.Errorf("resolved alerts missing")
	}

	logErr := n.nflog.Log(n.recv, gkey, firing, resolved)
	return ctx, alerts, logErr
}