prometheus/tsdb 的原始碼閱讀筆記 0x03

weixin_34185364發表於2018-01-17

之前的文章分段介紹了 prometheus/tsdb 下的各個 pkg 的具體內容
這篇文章將完整分析 prometheus/tsdb 本身的實現

tombstones.go

Stone

Stone 是作為刪除資料的標記

// Stone holds the information on the posting and time-range
// that is deleted. It is a deletion marker: the data it covers is only
// flagged here, not removed.
type Stone struct {
    // ref is the reference the deletion applies to.
    ref       uint64
    // intervals lists the deleted time ranges for ref.
    intervals Intervals
}
Interval, Intervals

用來記錄時間段

// Interval is one closed time range, bounded by Mint and Maxt (both inclusive).
type Interval struct {
    Mint int64
    Maxt int64
}

// inBounds reports whether t lies inside the interval, endpoints included.
func (tr Interval) inBounds(t int64) bool {
    if t < tr.Mint || t > tr.Maxt {
        return false
    }
    return true
}

// isSubrange reports whether a single element of dranges contains both
// endpoints of tr. Endpoints covered by two different elements do not count.
func (tr Interval) isSubrange(dranges Intervals) bool {
    for idx := range dranges {
        candidate := dranges[idx]
        if !candidate.inBounds(tr.Mint) {
            continue
        }
        if candidate.inBounds(tr.Maxt) {
            return true
        }
    }
    return false
}
TombstoneReader
// TombstoneReader gives access to tombstone intervals by series reference.
type TombstoneReader interface {
    // Get returns deletion intervals for the series with the given reference.
    Get(ref uint64) (Intervals, error)

    // Iter calls the given function for each encountered interval.
    // NOTE(review): the in-memory implementation in this file invokes the
    // callback once per series reference with all of that series' intervals
    // and stops at the first non-nil error the callback returns; confirm
    // other implementations behave the same.
    Iter(func(uint64, Intervals) error) error

    // Close closes any underlying resources.
    Close() error
}

提供了一個記憶體版的實現

// memTombstones is the in-memory TombstoneReader: a map from series
// reference to the deletion intervals recorded for that series.
type memTombstones map[uint64]Intervals

// emptyTombstoneReader is the single shared instance returned by
// EmptyTombstoneReader.
var emptyTombstoneReader = memTombstones{}

// EmptyTombstoneReader returns a TombstoneReader that is always empty.
func EmptyTombstoneReader() TombstoneReader {
    var reader TombstoneReader = emptyTombstoneReader
    return reader
}

// Get returns the intervals stored under ref. An unknown ref yields the zero
// value of Intervals; the error is always nil.
func (t memTombstones) Get(ref uint64) (Intervals, error) {
    stored := t[ref]
    return stored, nil
}

// Iter invokes f once per stored reference together with its intervals.
// Iteration stops at the first non-nil error from f, which is returned.
func (t memTombstones) Iter(f func(uint64, Intervals) error) error {
    for key, stored := range t {
        err := f(key, stored)
        if err != nil {
            return err
        }
    }
    return nil
}

// add records itv for ref by storing the result of Intervals.add back into
// the map.
func (t memTombstones) add(ref uint64, itv Interval) {
    updated := t[ref].add(itv)
    t[ref] = updated
}

// Close is a no-op; there are no underlying resources to release.
func (memTombstones) Close() error { return nil }

TombstoneReader 的內容可以被寫入檔案, 也可以通過檔案讀出.

// writeTombstoneFile persists the tombstones exposed by tr into the tombstone
// file inside dir. The final step renames "<path>.tmp" onto the real path;
// the section that produces the temporary file is elided in this excerpt
// (presumably it serializes tr into tmp — confirm against the full source).
func writeTombstoneFile(dir string, tr TombstoneReader) error {
    path := filepath.Join(dir, tombstoneFilename)
    tmp := path + ".tmp"
    
    // ...

    return renameFile(tmp, path)
}
// readTombstones reads the tombstone file inside dir and returns its content
// as an in-memory memTombstones map. Decoding is elided in this excerpt: d, k,
// mint and maxt are set up in the omitted sections. The visible loop runs
// while d.len() > 0 and adds one Interval{mint, maxt} under reference k per
// iteration.
func readTombstones(dir string) (memTombstones, error) {
    b, err := ioutil.ReadFile(filepath.Join(dir, tombstoneFilename))
    // ...

    stonesMap := memTombstones{}

    for d.len() > 0 {
        // ...
        stonesMap.add(k, Interval{mint, maxt})
    }

    return stonesMap, nil
}

wal.go

prometheus/tsdb 會將幾類資料先寫入 wal (write ahead log) 檔案

// WALEntryType indicates what data a WAL entry contains.
type WALEntryType uint8

// Entry types in a segment file. Every type carries an explicit value,
// starting at 1; the zero value of WALEntryType is not one of them.
const (
    WALEntrySymbols WALEntryType = 1
    WALEntrySeries  WALEntryType = 2
    WALEntrySamples WALEntryType = 3
    WALEntryDeletes WALEntryType = 4
)
// WAL is a write ahead log that can log new series labels and samples.
// It must be completely read before new entries are logged.
type WAL interface {
    // Reader returns a WALReader for this log.
    Reader() WALReader
    // LogSeries, LogSamples and LogDeletes each write one batch of the
    // respective record kind to the log.
    LogSeries([]RefSeries) error
    LogSamples([]RefSample) error
    LogDeletes([]Stone) error
    // Truncate deletes the values prior to mint and the series which the
    // keep function does not indicate to preserve.
    Truncate(mint int64, keep func(uint64) bool) error
    // Close closes the log.
    Close() error
}

// WALReader reads entries from a WAL.
type WALReader interface {
    // Read takes one callback per record kind (series, samples, deletes).
    // NOTE(review): presumably each decoded batch is handed to the callback
    // of its kind in log order — confirm against the implementation.
    Read(
        seriesf func([]RefSeries),
        samplesf func([]RefSample),
        deletesf func([]Stone),
    ) error
}

與之相關的資料結構定義如下

// RefSeries is the series labels with the series ID.
type RefSeries struct {
    // Ref is the series ID.
    Ref    uint64
    // Labels is the label set of the series.
    Labels labels.Labels
}

// RefSample is a timestamp/value pair associated with a reference to a series.
type RefSample struct {
    Ref uint64  // reference to the series
    T   int64   // timestamp
    V   float64 // value

    // In-memory series data; examined in detail later in these notes.
    series *memSeries
}

SegmentWAL

這是 WAL 的一個實現, 會將資料切成 256MB 一片進行儲存, 切片的組織方式與 chunks 類似.

相應的, 操作檔案的相關實現程式碼也很相似.

// segmentFile wraps a file object of a segment and tracks the highest timestamp
// it contains. During WAL truncating, all segments with no higher timestamp than
// the truncation threshold can be compacted.
type segmentFile struct {
    *os.File
    maxTime   int64  // highest tombstone or sample timestamp in segment
    minSeries uint64 // lowest series ID in segment
}

// SegmentWAL is a write ahead log for series data.
// According to the surrounding notes it is the WAL implementation that splits
// the log into segment files.
type SegmentWAL struct {
    mtx     sync.Mutex
    metrics *walMetrics

    dirFile *os.File
    files   []*segmentFile // segment files of this log

    logger        log.Logger
    flushInterval time.Duration
    segmentSize   int64

    crc32 hash.Hash32
    cur   *bufio.Writer
    curN  int64

    // Signalling channels.
    stopc   chan struct{}
    donec   chan struct{}
    
    // Operations executed in the background (consumed by run).
    actorc  chan func() error // sequentialized background operations
    
    buffers sync.Pool
}
LogXXXX

LogSeries, LogSamples, LogDeletes 分別將各自操作的資料編碼後寫入 WAL.

Truncate
// Truncate deletes the values prior to mint and the series which the keep function
// does not indicate to preserve.
// It is used to drop data that is no longer needed. The implementation is
// elided in this excerpt.
func (w *SegmentWAL) Truncate(mint int64, keep func(uint64) bool) error {
    // ...

    return nil
}
run

通過 OpenSegmentWAL 開啟一個 SegmentWAL 的時候, 會在一個獨立的 goroutine 中執行 run 函式, 用來處理 actorc 傳遞的後臺操作.

目前 actorc 傳遞的操作僅有檔案的分片

// cut finishes the currently active segments and opens the next one.
// The encoder is reset to point to the new segment.
//
// If a head segment exists, buffered data is flushed first and a flush error
// aborts the cut. Finalizing the old segment file is then queued on w.actorc
// from a separate goroutine. The code that opens the new segment is elided in
// this excerpt.
func (w *SegmentWAL) cut() error {
    // Sync current head to disk and close.
    if hf := w.head(); hf != nil {
        if err := w.flush(); err != nil {
            return err
        }
        
        // Finish last segment asynchronously to not block the WAL moving along
        // in the new segment.
        // Finalize the current segment file: cut it off at the current
        // offset, sync it and close it. Every failure is wrapped with the
        // segment's file name.
        go func() {
            w.actorc <- func() error {
                // NOTE(review): os.SEEK_CUR is deprecated in favour of
                // io.SeekCurrent; switching requires importing "io".
                off, err := hf.Seek(0, os.SEEK_CUR)
                if err != nil {
                    return errors.Wrapf(err, "finish old segment %s", hf.Name())
                }
                if err := hf.Truncate(off); err != nil {
                    return errors.Wrapf(err, "finish old segment %s", hf.Name())
                }
                if err := hf.Sync(); err != nil {
                    return errors.Wrapf(err, "finish old segment %s", hf.Name())
                }
                if err := hf.Close(); err != nil {
                    return errors.Wrapf(err, "finish old segment %s", hf.Name())
                }
                return nil
            }
        }()
    }

    // Initialize the new segment file for writing.
    // ...
    
    return nil
}

Compact.go

對底層儲存的壓縮相關的實現

// Compactor provides compaction against an underlying storage
// of time series data.
type Compactor interface {
    // Plan returns a set of non-overlapping directories that can
    // be compacted concurrently.
    // Results returned when compactions are in progress are undefined.
    Plan(dir string) ([]string, error)

    // Write persists a Block into a directory.
    // NOTE(review): the returned ULID presumably identifies the block that
    // was written — confirm against the implementation.
    Write(dest string, b BlockReader, mint, maxt int64) (ulid.ULID, error)

    // Compact runs compaction against the provided directories. Must
    // only be called concurrently with results of Plan().
    Compact(dest string, dirs ...string) (ulid.ULID, error)
}
LeveledCompactor

是 Compactor 的實現

Plan
// Plan returns a list of compactable blocks in the provided directory.
// It lists the block directories below dir, reads the meta file of each one
// and leaves the actual selection to c.plan. Error handling and the filling
// of dms are elided in this excerpt.
func (c *LeveledCompactor) Plan(dir string) ([]string, error) {
    dirs, err := blockDirs(dir)
    
    // ...
  
    var dms []dirMeta

    for _, dir := range dirs {
        // Read the BlockMeta; it is the basis for deciding whether the block
        // can be compacted.
        meta, err := readMetaFile(dir)
        
        // ...
    }
    return c.plan(dms)
}
populateBlock

LeveledCompactor.Write 和 LeveledCompactor.Compact 兩個方法中都用到 LeveledCompactor.write, 而 LeveledCompactor.populateBlock 是 write 方法的重要邏輯.

其作用是將一組 Block 的資料合併, 再寫入 IndexWriter, ChunkWriter.

// populateBlock fills the index and chunk writers with new data gathered as the union
// of the provided blocks. It returns meta information for the new block.
//
// This excerpt is abridged. Error checks after most calls are elided, as are
// the declarations of values and postings and the series counter i that is
// passed to AddSeries. NOTE(review): that i cannot be the index of the blocks
// loop, which is out of scope there.
func (c *LeveledCompactor) populateBlock(blocks []BlockReader, meta *BlockMeta, indexw IndexWriter, chunkw ChunkWriter) error {
    var (
        set        ChunkSeriesSet
        allSymbols = make(map[string]struct{}, 1<<16)
        closers    = []io.Closer{}
    )
    // Close everything collected in closers when the function returns.
    defer func() { closeAll(closers...) }()

    // Walk over the source blocks.
    for i, b := range blocks {
        indexr, err := b.Index()
        // ...

        chunkr, err := b.Chunks()
        // ...

        tombsr, err := b.Tombstones()
        // ...

        symbols, err := indexr.Symbols()
        // ...

        // Fetch the postings of all series in this block, sorted.
        all, err := indexr.Postings(index.AllPostingsKey())
        if err != nil {
            return err
        }
        all = indexr.SortedPostings(all)

        s := newCompactionSeriesSet(indexr, chunkr, tombsr, all)

        // ...
      
        // Merge with the set accumulated so far, forming a new merger.
        set, err = newCompactionMerger(set, s)
        if err != nil {
            return err
        }
    }

    // We fully rebuild the postings list index from merged series.
    // ...

    // Iterate over the merged series set.
    for set.Next() {
        lset, chks, dranges := set.At() // The chunks here are not fully deleted.

        // Skip the series with all deleted chunks.
        // ...

        if err := chunkw.WriteChunks(chks...); err != nil {
            return errors.Wrap(err, "write chunks")
        }

        if err := indexw.AddSeries(i, lset, chks...); err != nil {
            return errors.Wrap(err, "add series")
        }

        // ...
    }
    
    // ...

    // Write one label index per label name; s is a scratch slice for the
    // label values (how it is filled is elided).
    s := make([]string, 0, 256)
    for n, v := range values {
        // ...

        if err := indexw.WriteLabelIndex([]string{n}, s); err != nil {
            return errors.Wrap(err, "write label index")
        }
    }

    // Write the postings list of every label pair, in sorted key order.
    for _, l := range postings.SortedKeys() {
        if err := indexw.WritePostings(l.Name, l.Value, postings.Get(l.Name, l.Value)); err != nil {
            return errors.Wrap(err, "write postings")
        }
    }
    return nil
}

block.go

Block
Delete
// Delete matching series between mint and maxt in the block.
// As mentioned earlier in these notes, a delete only marks the data with
// tombstones for the time being; this is where that happens.
//
// The elided first part sets up stones (and err). The visible part copies
// the block's existing tombstones into stones, counting every interval in
// pb.meta.Stats.NumTombstones. It then swaps stones in and rewrites the
// tombstone file followed by the meta file.
func (pb *Block) Delete(mint, maxt int64, ms ...labels.Matcher) error {
    // ...

    err = pb.tombstones.Iter(func(id uint64, ivs Intervals) error {
        for _, iv := range ivs {
            stones.add(id, iv)
            pb.meta.Stats.NumTombstones++
        }
        return nil
    })
    if err != nil {
        return err
    }
    pb.tombstones = stones

    if err := writeTombstoneFile(pb.dir, pb.tombstones); err != nil {
        return err
    }
    return writeMetaFile(pb.dir, &pb.meta)
}
CleanTombstones
// CleanTombstones will rewrite the block if there are any tombstones to remove
// them and returns whether there was a re-write.
//
// The block is written into dest through c.Write, using the block's own
// MinTime and MaxTime. It returns (false, nil) when the block has no
// tombstone intervals, (true, nil) after a successful rewrite, and
// (false, err) when counting the tombstones or writing the block fails.
func (pb *Block) CleanTombstones(dest string, c Compactor) (bool, error) {
    numStones := 0

    // The callback never fails, but Iter itself may; dropping its error would
    // make a failed iteration look like "no tombstones".
    err := pb.tombstones.Iter(func(id uint64, ivs Intervals) error {
        numStones += len(ivs)
        return nil
    })
    if err != nil {
        return false, err
    }

    if numStones == 0 {
        return false, nil
    }

    if _, err := c.Write(dest, pb, pb.meta.MinTime, pb.meta.MaxTime); err != nil {
        return false, err
    }

    return true, nil
}
Snapshot

疑問, 這裡僅對目標資料夾及其內部檔案做了 hardlink, 怎麼確保內容不變?

head.go

Head

Head 向呼叫方提供某個時間段內的資料讀寫能力.

Head 會同時處理 WAL 內的和已經持久化的資料.

Head 可以認為是current Block

所有 Block 不可再寫入, Head 在寫入有效期過後會轉化為 Block 進行持久化.

Appender
// Appender returns a new Appender on the database.
// Which concrete appender is returned depends on the state of the head:
//   - initAppender, when the head has no minimum time yet; per the notes it
//     initializes the head's start time from the first sample it receives;
//   - headAppender otherwise.
// The active-appenders metric is incremented on every call.
func (h *Head) Appender() Appender {
    h.metrics.activeAppenders.Inc()

    // A head whose minimum time is already set can hand out a regular head
    // appender right away.
    if h.MinTime() != math.MinInt64 {
        return h.appender()
    }
    // Otherwise the head cache has no starting point yet; the init appender
    // picks up the first appended timestamp as the base.
    return &initAppender{head: h}
}

// appender builds a headAppender bound to h. Its mint is the head's maximum
// time minus half the chunk range, its sample buffer comes from
// h.getAppendBuffer, and highTimestamp starts at the smallest int64.
func (h *Head) appender() *headAppender {
    app := new(headAppender)
    app.head = h
    app.mint = h.MaxTime() - h.chunkRange/2
    app.samples = h.getAppendBuffer()
    app.highTimestamp = math.MinInt64
    return app
}

querier.go

圍繞以下三個介面, 向呼叫方提供查詢能力.

// Querier provides querying access over time series data of a fixed
// time range.
type Querier interface {
    // Select returns a set of series that matches the given label matchers.
    Select(...labels.Matcher) (SeriesSet, error)

    // LabelValues returns all potential values for a label name.
    LabelValues(string) ([]string, error)
    // LabelValuesFor returns all potential values for a label name
    // under the constraint of another label.
    LabelValuesFor(string, labels.Label) ([]string, error)

    // Close releases the resources of the Querier.
    Close() error
}

// Series exposes a single time series: its identifying labels plus an
// iterator over its data.
type Series interface {
    // Labels returns the complete set of labels identifying the series.
    Labels() labels.Labels

    // Iterator returns a new iterator of the data of the series.
    Iterator() SeriesIterator
}

// SeriesSet contains a set of series.
// NOTE(review): the method set suggests cursor-style use (call Next, read At,
// check Err afterwards), matching how populateBlock drives its series set;
// confirm the exact contract against the implementations.
type SeriesSet interface {
    Next() bool
    At() Series
    Err() error
}
querier, blockQuerier

blockQuerier 是針對一個 block 的 Querier

querier 是 blockQuerier 的聚合

db.go

Appender

Appender 是寫入介面, *Head 通過其 Appender() 方法返回具體的實現 (initAppender 或 headAppender)

// Appender allows appending a batch of data. It must be completed with a
// call to Commit or Rollback and must not be reused afterwards.
//
// Operations on the Appender interface are not goroutine-safe.
type Appender interface {
    // Add adds a sample pair for the given series. A reference number is
    // returned which can be used to add further samples in the same or later
    // transactions.
    // Returned reference numbers are ephemeral and may be rejected in calls
    // to AddFast() at any point. Adding the sample via Add() returns a new
    // reference number.
    // NOTE(review): the original comment said a reference that "is the empty
    // string" must not be used for caching. The reference is a uint64, so
    // this presumably means the zero value — confirm.
    Add(l labels.Labels, t int64, v float64) (uint64, error)

    // AddFast adds a sample pair for the referenced series. It is generally
    // faster than adding a sample by providing its full label set.
    AddFast(ref uint64, t int64, v float64) error

    // Commit submits the collected samples and purges the batch.
    Commit() error

    // Rollback rolls back all modifications made in the appender so far.
    Rollback() error
}
DB

DB 是向呼叫者提供的最主要的結構體.

// DB handles reads and writes of time series falling into
// a hashed partition of a seriedb.
type DB struct {
    dir   string
    lockf *lockfile.Lockfile

    logger    log.Logger
    metrics   *dbMetrics
    opts      *Options
    chunkPool chunkenc.Pool
    compactor Compactor

    // mtx must be held when modifying the general block layout.
    mtx    sync.RWMutex
    blocks []*Block

    // head is the block currently open for writes (see the Head section of
    // these notes).
    head *Head

    // compactc requests a compaction from the run loop; stopc tells run to
    // exit, and run closes donec when it returns.
    compactc chan struct{}
    donec    chan struct{}
    stopc    chan struct{}

    // cmtx is used to control compactions and deletions.
    cmtx               sync.Mutex
    compactionsEnabled bool
}
reload
// reload on-disk blocks and trigger head truncation if new blocks appeared. It takes
// a list of block directories which should be deleted during reload.
//
// This excerpt is abridged: error checks, the handling of deleteable, the
// origin of oldBlocks and the installation of the new block list are elided.
func (db *DB) reload(deleteable ...string) (err error) {
    // ...
    
    // Read all block directories that currently exist.
    dirs, err := blockDirs(db.dir)
    
    // ...
    
    var (
        blocks []*Block
        exist  = map[ulid.ULID]struct{}{}
    )

    for _, dir := range dirs {
        meta, err := readMetaFile(dir)
        
        // ...

        // Try to get the Block for this directory: first from memory, then
        // from disk.
        b, ok := db.getBlock(meta.ULID)
        if !ok {
            b, err = OpenBlock(dir, db.chunkPool)
            
            // ...
        }

        blocks = append(blocks, b)
        exist[meta.ULID] = struct{}{}
    }

    // Check the collected blocks as a sequence. (The original note describes
    // this step as re-ordering the blocks by the time range they cover; the
    // visible call only validates.)
    if err := validateBlockSequence(blocks); err != nil {
        return errors.Wrap(err, "invalid block sequence")
    }

    // ...
    
    // Remove Block files that are no longer needed: close and delete every
    // old block that is not among the directories just read. Failures are
    // only logged as warnings.
    for _, b := range oldBlocks {
        if _, ok := exist[b.Meta().ULID]; ok {
            continue
        }
        if err := b.Close(); err != nil {
            level.Warn(db.logger).Log("msg", "closing block failed", "err", err)
        }
        if err := os.RemoveAll(b.Dir()); err != nil {
            level.Warn(db.logger).Log("msg", "deleting block failed", "err", err)
        }
    }

    // Garbage collect data in the head if the most recent persisted block
    // covers data of its current time range.
    if len(blocks) == 0 {
        return nil
    }
    maxt := blocks[len(blocks)-1].Meta().MaxTime

    return errors.Wrap(db.head.Truncate(maxt), "head truncate failed")
}
run

run 方法在 Open 時被呼叫, 在一個單獨的 goroutine 中執行, 主要是定期對資料進行壓縮以節省空間

// run is the DB's background loop. It returns as soon as db.stopc fires and
// closes db.donec on the way out. The compaction code itself is elided in
// this excerpt.
func (db *DB) run() {
    defer close(db.donec)

    // NOTE(review): backoff stays zero in the visible code; presumably the
    // elided compaction branch adjusts it — confirm.
    backoff := time.Duration(0)

    for {
        // Wait out the current backoff, unless asked to stop.
        select {
        case <-db.stopc:
            return
        case <-time.After(backoff):
        }

        select {
        case <-time.After(1 * time.Minute):
            // Once a minute, request a compaction. The send is non-blocking,
            // so a request that is already pending is not duplicated.
            select {
            case db.compactc <- struct{}{}:
            default:
            }
        case <-db.compactc:
            // Run the compaction-related code.

        case <-db.stopc:
            return
        }
    }
}
Appender

返回的是封裝的結果 dbAppender, 後面專門再分析

Querier

返回的是所有指定時間範圍內的 Block 聚合

// Querier returns a new querier over the data partition for the given time range.
// A goroutine must not handle more than one open Querier.
//
// The result aggregates one block querier per persisted block whose time
// range overlaps [mint, maxt], plus one for the head when maxt reaches the
// head's minimum time. The read lock on db.mtx is held for the whole call.
func (db *DB) Querier(mint, maxt int64) (Querier, error) {
    db.mtx.RLock()
    defer db.mtx.RUnlock()

    // Collect the persisted blocks that fall into the requested range.
    var readers []BlockReader
    for _, blk := range db.blocks {
        meta := blk.Meta()
        if !intervalOverlap(mint, maxt, meta.MinTime, meta.MaxTime) {
            continue
        }
        readers = append(readers, blk)
    }

    // The head can be seen as the current block.
    if db.head.MinTime() <= maxt {
        readers = append(readers, db.head)
    }

    // Open one querier per selected reader.
    opened := make([]Querier, 0, len(readers))
    for _, r := range readers {
        q, err := NewBlockQuerier(r, mint, maxt)
        if err != nil {
            // If we fail, all previously opened queriers must be closed.
            for _, prev := range opened {
                prev.Close()
            }
            return nil, errors.Wrapf(err, "open querier for block %s", r)
        }
        opened = append(opened, q)
    }
    return &querier{blocks: opened}, nil
}
Delete

這邊實際會將 Delete 操作分給各個受影響的 Block

CleanTombstones

前面提到, 各個 Block Delete 內的邏輯實際是寫 WAL 以及 Tombstone 檔案

這裡會對當前所有 Block 真正進行清理, 然後呼叫 reload 方法.

dbAppender

是對 *headAppender 的封裝, 在 Commit 的時候觸發 compact

// Appender opens a new appender against the database. The returned value
// wraps the head's appender in a dbAppender.
func (db *DB) Appender() Appender {
    wrapped := dbAppender{
        Appender: db.head.Appender(),
        db:       db,
    }
    return wrapped
}

// dbAppender wraps the DB's head appender and triggers compactions on commit
// if necessary.
type dbAppender struct {
    Appender
    db *DB
}

// Commit commits the wrapped appender and returns its error. Regardless of
// that error, a compaction is requested when the time span covered by the
// head exceeds chunkRange/2*3 (integer arithmetic). The request is a
// non-blocking send on db.compactc, so it is dropped if one is pending.
func (a dbAppender) Commit() error {
    commitErr := a.Appender.Commit()

    // We could just run this check every few minutes practically. But for benchmarks
    // and high frequency use cases this is the safer way.
    head := a.db.head
    span := head.MaxTime() - head.MinTime()
    if span > head.chunkRange/2*3 {
        select {
        case a.db.compactc <- struct{}{}:
        default:
        }
    }
    return commitErr
}

Summary

prometheus/tsdb (下稱 ptsdb ) 的結構體之間的層次大概可以這樣劃分:

  • DB: 對外提供的核心物件

    • Block: 已經持久化的, 覆蓋某個時間段的時序資料. Block 包含:
      • Index: 用於儲存 labels 的索引資料
      • Chunk: 用於儲存時間戳-取樣值 資料
  • Head: 由於 ptsdb 規定, 資料必須增序寫入, 已經持久化的 Block 不能再寫入, 因此一個時刻只會有一個可供寫入的 Block, 即 Head. Head 同時還承擔記錄刪除動作的任務
    • WAL 增刪改的動作都會先進入 WAL, 供後續恢復用
    • Tombstone: 用於標記刪除動作, 被標記的資料在 compact 的時候統一清理
  • Compactor: 對檔案進行壓縮. Block 資料的組織參考了 LSM, 因此 Compactor 的實現也和基於 LSM 的 kv db 類似.

關於 ptsdb, 《時間序列資料的儲存和計算 - 開源時序資料庫解析(四)》 這篇文章有更宏觀的闡述, 可以參考.

相關文章