技術分享 | AlertManager 原始碼解析

作者：石蓓蓓
愛可生研發工程師，主要負責愛可生產品雲DMP樹產品的研發工作。
本文來源：原創投稿
*愛可生開源社群出品，原創內容未經授權不得隨意使用，轉載請聯絡小編並註明來源。

AlertManager 用於處理客戶端應用程式（例如 Prometheus 的服務端）傳送過來的告警。對於輸入的告警，會經過分組、抑制、靜默、去重等步驟，最終將告警傳送給接收者（郵箱等）。

alertManager 的框架圖如下：

今天主要是分享 AlertManager 中 Pipeline 相關的流程程式碼，pipeline 主要是用來處理分組後的告警，經過抑制、靜默、去重，然後傳送。

首先在建立 Pipeline 的時候，會建立 GossipSettleStage 、MuteStage（包含抑制和靜默）、WaitStage 、DedupStage 、RetryStage 、SetNotifiesStage 。

// New returns a map of receivers to Stages.
func (pb *PipelineBuilder) New(
 receivers map[string][]Integration,
 wait func() time.Duration,
 inhibitor *inhibit.Inhibitor,
 silencer *silence.Silencer,
 notificationLog NotificationLog,
 peer *cluster.Peer,
) RoutingStage {
 rs := make(RoutingStage, len(receivers))

 ms := NewGossipSettleStage(peer)
 is := NewMuteStage(inhibitor)
 ss := NewMuteStage(silencer)

 for name := range receivers {
  st := createReceiverStage(name, receivers[name], wait, notificationLog, pb.metrics)
  rs[name] = MultiStage{ms, is, ss, st}
 }
 return rs
}

// createReceiverStage creates a pipeline of stages for a receiver.
func createReceiverStage(
 name string,
 integrations []Integration,
 wait func() time.Duration,
 notificationLog NotificationLog,
 metrics *metrics,
) Stage {
 var fs FanoutStage
 for i := range integrations {
  recv := &nflogpb.Receiver{
   GroupName:   name,
   Integration: integrations[i].Name(),
   Idx:         uint32(integrations[i].Index()),
  }
  var s MultiStage
  s = append(s, NewWaitStage(wait))
  s = append(s, NewDedupStage(&integrations[i], notificationLog, recv))
  s = append(s, NewRetryStage(integrations[i], name, metrics))
  s = append(s, NewSetNotifiesStage(notificationLog, recv))

  fs = append(fs, s)
 }
 return fs
}

從上面的程式碼可以看到 AlertManager 在某一通道處理時會經過 GossipSettleStage 、MuteStage（抑制）、MuteStage（靜默）、WaitStage 、DedupStage 、RetryStage 、SetNotifiesStage 這7個 stage （其中 MuteStage 分別針對抑制和靜默各建立了一個例項），並且順序執行。

Pipeline 的執行會依序遍歷所有的 stage ，逐一執行其 Exec 方法（見程式碼的第8行），且每次執行後返回的 alert 列表會作為下一個 stage 的引數（第8行的程式碼將返回值重新賦給傳入的引數 alerts ，因此下次執行 Exec 時傳入的 alerts 是新的值）。若某個 stage 返回錯誤，或告警列表已經為空，則會提前結束；最終得到的 alert 列表是經過每個 stage 過濾後的告警列表。

func (ms MultiStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
 var err error
 for _, s := range ms {
  if len(alerts) == 0 {
   return ctx, nil, nil
  }

  ctx, alerts, err = s.Exec(ctx, l, alerts...)
  if err != nil {
   return ctx, nil, err
  }
 }
 return ctx, alerts, nil
}

GossipSettle

等待叢集準備完畢。

func (n *GossipSettleStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
 if n.peer != nil {
  n.peer.WaitReady()
 }
 return ctx, alerts, nil
}

Inhibitor 抑制

抑制首先會執行 MuteStage 的 Exec，在匹配到抑制規則後，該告警就不會被傳送。主要是執行第6行的 n.muter.Mutes 方法來進行匹配：

func (n *MuteStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
 var filtered []*types.Alert
 for _, a := range alerts {
  // TODO(fabxc): increment total alerts counter.
  // Do not send the alert if muted.
  if !n.muter.Mutes(a.Labels) {
   filtered = append(filtered, a)
  } else {
   n.postMuteHandle(a)
  }
  // TODO(fabxc): increment muted alerts counter if muted.
 }
 return ctx, filtered, nil
}

抑制條件是如何匹配的呢？

我們在設定抑制規則時，會設定抑制源和抑制目標。在啟動 Inhibitor 的時候，會先匹配抑制源（也就是Source），如果某條告警的 label 滿足抑制源的條件，則會被放入 scache 中（第17行進行匹配，在21行時匹配成功寫入 scache 中）。

func (ih *Inhibitor) run(ctx context.Context) {
 it := ih.alerts.Subscribe()
 defer it.Close()

 for {
  select {
  case <-ctx.Done():
   return
  case a := <-it.Next():
   if err := it.Err(); err != nil {
    level.Error(ih.logger).Log("msg", "Error iterating alerts", "err", err)
    continue
   }
   // Update the inhibition rules' cache.
   for _, r := range ih.rules {
    if r.IsExpressionMatch {
     if matched, err := r.SourceExpMatcher.Match(a.Labels); err != nil {
      level.Error(ih.logger).Log("msg", "Error expression match alerts", "err", err)
      continue
     } else if matched {
      if err := r.scache.Set(a); err != nil {
       level.Error(ih.logger).Log("msg", "error on set alert", "err", err)
      }
     }
    } else if r.SourceMatchers.Match(a.Labels) {
     if err := r.scache.Set(a); err != nil {
      level.Error(ih.logger).Log("msg", "error on set alert", "err", err)
     }
    }
   }
  }
 }
}

此時如果有新產生的告警正好滿足抑制規則的抑制目標（也就是 target）規則，並且透過 hasEqual 找到了對應的抑制源告警，那麼這條告警會透過方法 SetInhibited 被標記為抑制。在被標記為抑制時，也會同時記錄抑制源告警的指紋。

// Mutes returns true if the given label set is muted. It implements the Muter
// interface.
func (ih *Inhibitor) Mutes(lset model.LabelSet) bool {
fp := lset.Fingerprint()

 for _, r := range ih.rules {
if r.IsExpressionMatch {
 if targetMatched, err := r.TargetExpMatcher.Match(lset); err != nil {
  level.Error(ih.logger).Log("msg", "Error inhibitor expression match alerts", "err", err)
  continue
   } else {
  if !targetMatched {
  continue
    }
   }
  } else {
 if !r.TargetMatchers.Match(lset) {
   // If target side of rule doesn't match, we don't need to look any further.
   continue
  }
 }

 // If we are here, the target side matches. If the source side matches, too, we
 // need to exclude inhibiting alerts for which the same is true.
 sourceMatched := false
 if r.IsExpressionMatch {
  if matched, err := r.SourceExpMatcher.Match(lset); err != nil {
  level.Error(ih.logger).Log("msg", "Error inhibitor expression match alerts", "err", err)
 continue
   } else {
    sourceMatched = matched
   }
 } else {
  sourceMatched = r.SourceMatchers.Match(lset)
  }
  if inhibitedByFP, eq := r.hasEqual(ih.logger, lset, sourceMatched); eq {
   ih.marker.SetInhibited(fp, inhibitedByFP.String())
   return true
  }
 }
 ih.marker.SetInhibited(fp)

 return false
}

Silencer 靜默

靜默規則同樣是執行 MuteStage 的 Exec，新的告警的 labels 匹配到處於生效狀態的靜默規則的條件後，新的告警就會被靜默，透過 SetSilenced 進行標記，同時會記錄匹配到的靜默規則的 ID 以及靜默規則的版本號。

// Mutes implements the Muter interface.
//
// It reports whether at least one active silence applies to the given label
// set, and keeps the marker entry for the label set's fingerprint (silence
// IDs plus the silences version they were computed against) up to date.
func (s *Silencer) Mutes(lset model.LabelSet) bool {
 fp := lset.Fingerprint()
 // ids and markerVersion are what the marker recorded on a previous call;
 // the third return value of Silenced is not needed here.
 ids, markerVersion, _ := s.marker.Silenced(fp)

 var (
  err        error
  sils       []*pb.Silence
  newVersion = markerVersion
 )
 if markerVersion == s.silences.Version() {
  // No new silences added, just need to check which of the old
  // silences are still relevant.
  if len(ids) == 0 {
   // Super fast path: No silences ever applied to this
   // alert, none have been added. We are done.
   return false
  }
  // This is still a quite fast path: No silences have been added,
  // we only need to check which of the applicable silences are
  // currently active. Note that newVersion is left at
  // markerVersion because the Query call might already return a
  // newer version, which is not the version our old list of
  // applicable silences is based on.
  sils, _, err = s.silences.Query(
   QIDs(ids...),
   QState(types.SilenceStateActive),
  )
 } else {
  // New silences have been added, do a full query.
  sils, newVersion, err = s.silences.Query(
   QState(types.SilenceStateActive),
   QMatches(lset),
  )
 }
 // A query error is only logged; the code below continues with whatever
 // sils the failed query returned.
 if err != nil {
  level.Error(s.logger).Log("msg", "Querying silences failed, alerts might not get silenced correctly", "err", err)
 }
 if len(sils) == 0 {
  // No active silence applies: record the version with no silence IDs.
  s.marker.SetSilenced(fp, newVersion)
  return false
 }
 idsChanged := len(sils) != len(ids)
 if !idsChanged {
  // Length is the same, but is the content the same?
  // NOTE(review): this is a positional comparison, so it presumably relies
  // on Query returning silences in the same order as the sorted ids —
  // confirm. The loop variable s shadows the receiver inside this loop.
  for i, s := range sils {
   if ids[i] != s.Id {
    idsChanged = true
    break
   }
  }
 }
 if idsChanged {
  // Need to recreate ids.
  ids = make([]string, len(sils))
  for i, s := range sils {
   ids[i] = s.Id
  }
  sort.Strings(ids) // For comparability.
 }
 if idsChanged || newVersion != markerVersion {
  // Update marker only if something changed.
  s.marker.SetSilenced(fp, newVersion, ids...)
 }
 return true
}

WaitStage

WaitStage 表示向其他例項傳送 Notification Log 的時間間隔，只是單純的時間等待：等待 wait() 返回的時長後繼續執行後面的 stage；如果在等待期間 context 被取消，則直接返回 ctx.Err()，不再繼續處理。

// Exec implements the Stage interface.
func (ws *WaitStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
 select {
 case <-time.After(ws.wait()):
 case <-ctx.Done():
  return ctx, nil, ctx.Err()
 }
 return ctx, alerts, nil
}

DedupStage

DedupStage 主要是透過計算告警的hash值來起到去重的作用。

func (n *DedupStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
 gkey, ok := GroupKey(ctx)
 if !ok {
  return ctx, nil, fmt.Errorf("group key missing")
 }

 repeatInterval, ok := RepeatInterval(ctx)
 if !ok {
  return ctx, nil, fmt.Errorf("repeat interval missing")
 }

 firingSet := map[uint64]struct{}{}
 resolvedSet := map[uint64]struct{}{}
 firing := []uint64{}
 resolved := []uint64{}

 var hash uint64
 for _, a := range alerts {
  hash = n.hash(a)
  if a.Resolved() {
   resolved = append(resolved, hash)
   resolvedSet[hash] = struct{}{}
  } else {
   firing = append(firing, hash)
   firingSet[hash] = struct{}{}
  }
 }

 ctx = WithFiringAlerts(ctx, firing)
 ctx = WithResolvedAlerts(ctx, resolved)

 entries, err := n.nflog.Query(nflog.QGroupKey(gkey), nflog.QReceiver(n.recv))
 if err != nil && err != nflog.ErrNotFound {
  return ctx, nil, err
 }

 var entry *nflogpb.Entry
 switch len(entries) {
 case 0:
 case 1:
  entry = entries[0]
 default:
  return ctx, nil, fmt.Errorf("unexpected entry result size %d", len(entries))
 }

 if n.needsUpdate(entry, firingSet, resolvedSet, repeatInterval) {
  return ctx, alerts, nil
 }
 return ctx, nil, nil
}

RetryStage

主要是根據不同的通道來傳送告警，如果失敗，會進行重試。

func (r RetryStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
 var sent []*types.Alert

 // If we shouldn't send notifications for resolved alerts, but there are only
 // resolved alerts, report them all as successfully notified (we still want the
 // notification log to log them for the next run of DedupStage).
 if !r.integration.SendResolved() {
  firing, ok := FiringAlerts(ctx)
 if !ok {
   return ctx, nil, fmt.Errorf("firing alerts missing")
  }
  if len(firing) == 0 {
   return ctx, alerts, nil
  }
  for _, a := range alerts {
   if a.Status() != model.AlertResolved {
    sent = append(sent, a)
   }
  }
 } else {
  sent = alerts
 }

 var (
  i    = 0
  b    = backoff.NewExponentialBackOff()
  tick = backoff.NewTicker(b)
  iErr error
 )
 defer tick.Stop()

 for {
  i++
  // Always check the context first to not notify again.
  select {
  case <-ctx.Done():
   if iErr != nil {
    return ctx, nil, iErr
   }

   return ctx, nil, ctx.Err()
  default:
  }

  select {
  case <-tick.C:
   now := time.Now()
   retry, err := r.integration.Notify(ctx, sent...)
   r.metrics.notificationLatencySeconds.WithLabelValues(r.integration.Name()).Observe(time.Since(now).Seconds())
   r.metrics.numNotifications.WithLabelValues(r.integration.Name()).Inc()
   if err != nil {
    r.metrics.numFailedNotifications.WithLabelValues(r.integration.Name()).Inc()
    level.Debug(l).Log("msg", "Notify attempt failed", "attempt", i, "integration", r.integration.Name(), "receiver", r.groupName, "err", err)
    if !retry {
     return ctx, alerts, fmt.Errorf("cancelling notify retry for %q due to unrecoverable error: %s", r.integration.Name(), err)
    }

    // Save this error to be able to return the last seen error by an
    // integration upon context timeout.
    iErr = err
   } else {
    return ctx, alerts, nil
   }
  case <-ctx.Done():
   if iErr != nil {
    return ctx, nil, iErr
   }

   return ctx, nil, ctx.Err()
  }
 }
}

SetNotifiesStage

SetNotifiesStage 主要是在告警已經傳送給通道之後，將本次通知（接收者、group key 以及 firing / resolved 告警的 hash）記錄到 alertManager 的 Notification Log（nflog）中，供之後的 DedupStage 去重時查詢。

func (n SetNotifiesStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
gkey, ok := GroupKey(ctx)
if !ok {
return ctx, nil, fmt.Errorf("group key missing")
}

firing, ok := FiringAlerts(ctx)
if !ok {
return ctx, nil, fmt.Errorf("firing alerts missing")
}

resolved, ok := ResolvedAlerts(ctx)
if !ok {
return ctx, nil, fmt.Errorf("resolved alerts missing")
}

return ctx, alerts, n.nflog.Log(n.recv, gkey, firing, resolved)
}