[TVM Tutorial] Scan and Recurrent Kernel

Published by HyperAI on 2024-10-29

Apache TVM is an end-to-end deep learning compiler framework for CPUs, GPUs, and a variety of machine learning accelerators. For more TVM documentation in Chinese, visit → https://tvm.hyper.ai/
Author: Tianqi Chen

This tutorial introduces how to do recurrent computing in TVM, a typical pattern in neural networks.

from __future__ import absolute_import, print_function

import tvm
import tvm.testing
from tvm import te
import numpy as np

TVM supports a scan operator to describe symbolic loops. The following scan op computes the cumulative sum over the columns of X.

The scan is carried out over the highest dimension of the tensor. s_state is a placeholder that describes the transition state of the scan. s_init describes how to initialize the first k timesteps; here its first dimension is 1, so it describes how to initialize the state at the first timestep.

s_update describes how to update the value at timestep t. The updated value can refer back to the values of the previous timestep via the state placeholder. Note that it is invalid to refer to s_state at the current or a later timestep.

The scan takes a state placeholder, an initial value, and an update description. It is also recommended (though not strictly necessary) to list the inputs of the scan cell. The result of the scan is a tensor: the value of s_state after the updates over the time domain.

m = te.var("m")
n = te.var("n")
X = te.placeholder((m, n), name="X")
s_state = te.placeholder((m, n))
s_init = te.compute((1, n), lambda _, i: X[0, i])
s_update = te.compute((m, n), lambda t, i: s_state[t - 1, i] + X[t, i])
s_scan = tvm.te.scan(s_init, s_update, s_state, inputs=[X])
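
For intuition, the scan above corresponds to the plain-numpy recurrence below (an illustrative sketch only, not part of the TVM pipeline):

import numpy as np

def cumsum_scan(x):
    # s_init: the state at the first timestep is X[0, :]
    s = np.empty_like(x)
    s[0] = x[0]
    # s_update: each timestep refers back to the previous one
    for t in range(1, x.shape[0]):  # time runs over the highest dimension
        s[t] = s[t - 1] + x[t]
    return s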

Schedule the Scan Cell

The body of the scan can be scheduled by scheduling the update and init parts separately. Note that it is invalid to schedule the first iteration dimension of the update part. To split the time iteration, schedule on scan_op.scan_axis instead (see the sketch after the lowered output below).

s = te.create_schedule(s_scan.op)
num_thread = 256
block_x = te.thread_axis("blockIdx.x")
thread_x = te.thread_axis("threadIdx.x")
xo, xi = s[s_init].split(s_init.op.axis[1], factor=num_thread)
s[s_init].bind(xo, block_x)
s[s_init].bind(xi, thread_x)
xo, xi = s[s_update].split(s_update.op.axis[1], factor=num_thread)
s[s_update].bind(xo, block_x)
s[s_update].bind(xi, thread_x)
print(tvm.lower(s, [X, s_scan], simple_mode=True))

Output:

@main = primfn(X_1: handle, scan_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {X: Buffer(X_2: Pointer(float32), float32, [(stride: int32*m: int32)], [], type="auto"),
             scan: Buffer(scan_2: Pointer(float32), float32, [(stride_1: int32*m)], [], type="auto")}
  buffer_map = {X_1: X, scan_1: scan}
  preflattened_buffer_map = {X_1: X_3: Buffer(X_2, float32, [m, n: int32], [stride, stride_2: int32], type="auto"), scan_1: scan_3: Buffer(scan_2, float32, [m, n], [stride_1, stride_3: int32], type="auto")} {
  attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = floordiv((n + 255), 256);
  attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 256;
  if @tir.likely((((blockIdx.x*256) + threadIdx.x) < n), dtype=bool) {
    scan[(((blockIdx.x*256) + threadIdx.x)*stride_3)] = X[(((blockIdx.x*256) + threadIdx.x)*stride_2)]
  }
  for (scan.idx: int32, 0, (m - 1)) {
    attr [IterVar(blockIdx.x, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = floordiv((n + 255), 256);
    attr [IterVar(threadIdx.x, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 256;
    if @tir.likely((((blockIdx.x*256) + threadIdx.x) < n), dtype=bool) {
      let cse_var_1: int32 = (scan.idx + 1)
      scan[((cse_var_1*stride_1) + (((blockIdx.x*256) + threadIdx.x)*stride_3))] = (scan[((scan.idx*stride_1) + (((blockIdx.x*256) + threadIdx.x)*stride_3))] + X[((cse_var_1*stride) + (((blockIdx.x*256) + threadIdx.x)*stride_2))])
    }
  }
}
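
As noted above, the first iteration dimension of the update part cannot be split directly; the time iteration is scheduled through scan_op.scan_axis instead. A minimal sketch based on that statement (not a schedule used elsewhere in this tutorial):

# s_scan.op is a ScanOp; scan_op.scan_axis is its time IterVar.
# Sketch: split the time iteration on the scan stage (illustrative only).
s_time = te.create_schedule(s_scan.op)
to, ti = s_time[s_scan].split(s_scan.op.scan_axis, factor=4)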

Build and Verify

The scan kernel can be built like any other TVM kernel; here numpy is used to verify the correctness of the result.

fscan = tvm.build(s, [X, s_scan], "cuda", name="myscan")
dev = tvm.cuda(0)
n = 1024
m = 10
a_np = np.random.uniform(size=(m, n)).astype(s_scan.dtype)
a = tvm.nd.array(a_np, dev)
b = tvm.nd.array(np.zeros((m, n), dtype=s_scan.dtype), dev)
fscan(a, b)
tvm.testing.assert_allclose(b.numpy(), np.cumsum(a_np, axis=0))

Multi-Stage Scan Cell

The example above described the scan cell with a single tensor computation stage in s_update. It is possible to use multiple tensor stages in the scan cell.

The following code demonstrates a scan whose scan cell contains two stage operations; together, the two stages compute the recurrence s[t] = s[t-1] * 2 + X[t]:

m = te.var("m")
n = te.var("n")
X = te.placeholder((m, n), name="X")
s_state = te.placeholder((m, n))
s_init = te.compute((1, n), lambda _, i: X[0, i])
s_update_s1 = te.compute((m, n), lambda t, i: s_state[t - 1, i] * 2, name="s1")
s_update_s2 = te.compute((m, n), lambda t, i: s_update_s1[t, i] + X[t, i], name="s2")
s_scan = tvm.te.scan(s_init, s_update_s2, s_state, inputs=[X])

These intermediate tensors can be scheduled normally. To ensure correctness, TVM creates a group constraint that forbids the scan body from being compute_at locations outside the scan loop.

s = te.create_schedule(s_scan.op)
xo, xi = s[s_update_s2].split(s_update_s2.op.axis[1], factor=32)
s[s_update_s1].compute_at(s[s_update_s2], xo)

print(tvm.lower(s, [X, s_scan], simple_mode=True))

Output:
@main = primfn(X_1: handle, scan_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {X: Buffer(X_2: Pointer(float32), float32, [(stride: int32*m: int32)], [], type="auto"),
             scan: Buffer(scan_2: Pointer(float32), float32, [(stride_1: int32*m)], [], type="auto")}
  buffer_map = {X_1: X, scan_1: scan}
  preflattened_buffer_map = {X_1: X_3: Buffer(X_2, float32, [m, n: int32], [stride, stride_2: int32], type="auto"), scan_1: scan_3: Buffer(scan_2, float32, [m, n], [stride_1, stride_3: int32], type="auto")} {
  allocate(s1: Pointer(global float32), float32, [32]), storage_scope = global {
    for (i: int32, 0, n) {
      scan[(i*stride_3)] = X[(i*stride_2)]
    }
    for (scan.idx: int32, 0, (m - 1)) {
      for (i.outer: int32, 0, floordiv((n + 31), 32)) {
        for (i_1: int32, 0, 32) {
          if @tir.likely((((i.outer*32) + i_1) < n), dtype=bool) {
            s1_1: Buffer(s1, float32, [32], [])[i_1] = (scan[((scan.idx*stride_1) + (((i.outer*32) + i_1)*stride_3))]*2f32)
          }
        }
        for (i.inner: int32, 0, 32) {
          if @tir.likely((((i.outer*32) + i.inner) < n), dtype=bool) {
            let cse_var_2: int32 = (scan.idx + 1)
            let cse_var_1: int32 = ((i.outer*32) + i.inner)
            scan[((cse_var_2*stride_1) + (cse_var_1*stride_3))] = (s1_1[i.inner] + X[((cse_var_2*stride) + (cse_var_1*stride_2))])
          }
        }
      }
    }
  }
}
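
The tutorial verifies only the first kernel; as a sketch (assuming a TVM build with the LLVM backend enabled), the two-stage scan can be checked against the recurrence s[0] = X[0], s[t] = s[t-1] * 2 + X[t]:

fscan2 = tvm.build(s, [X, s_scan], "llvm", name="two_stage_scan")
dev = tvm.cpu(0)
num_t, num_i = 10, 64
x_np = np.random.uniform(size=(num_t, num_i)).astype(s_scan.dtype)
ref = np.empty_like(x_np)
ref[0] = x_np[0]  # s_init
for t in range(1, num_t):
    ref[t] = ref[t - 1] * 2 + x_np[t]  # stage s1, then stage s2
a = tvm.nd.array(x_np, dev)
b = tvm.nd.array(np.zeros((num_t, num_i), dtype=s_scan.dtype), dev)
fscan2(a, b)
tvm.testing.assert_allclose(b.numpy(), ref, rtol=1e-5)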

Multiple States

Complicated applications like RNNs may need more than one recurrent state. Scan supports multiple recurrent states; the following example demonstrates how to build a recurrence with two states.

m = te.var("m")
n = te.var("n")
l = te.var("l")
X = te.placeholder((m, n), name="X")
s_state1 = te.placeholder((m, n))
s_state2 = te.placeholder((m, l))
s_init1 = te.compute((1, n), lambda _, i: X[0, i])
s_init2 = te.compute((1, l), lambda _, i: 0.0)
s_update1 = te.compute((m, n), lambda t, i: s_state1[t - 1, i] + X[t, i])
s_update2 = te.compute((m, l), lambda t, i: s_state2[t - 1, i] + s_state1[t - 1, 0])
s_scan1, s_scan2 = tvm.te.scan(
    [s_init1, s_init2], [s_update1, s_update2], [s_state1, s_state2], inputs=[X]
)
s = te.create_schedule(s_scan1.op)
print(tvm.lower(s, [X, s_scan1, s_scan2], simple_mode=True))

Output:

@main = primfn(X_1: handle, scan_2: handle, scan_3: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {X: Buffer(X_2: Pointer(float32), float32, [(stride: int32*m: int32)], [], type="auto"),
             scan: Buffer(scan_4: Pointer(float32), float32, [(stride_1: int32*m)], [], type="auto"),
             scan_1: Buffer(scan_5: Pointer(float32), float32, [(stride_2: int32*m)], [], type="auto")}
  buffer_map = {X_1: X, scan_2: scan, scan_3: scan_1}
  preflattened_buffer_map = {X_1: X_3: Buffer(X_2, float32, [m, n: int32], [stride, stride_3: int32], type="auto"), scan_2: scan_6: Buffer(scan_4, float32, [m, n], [stride_1, stride_4: int32], type="auto"), scan_3: scan_7: Buffer(scan_5, float32, [m, l: int32], [stride_2, stride_5: int32], type="auto")} {
  for (i: int32, 0, n) {
    scan[(i*stride_4)] = X[(i*stride_3)]
  }
  for (i_1: int32, 0, l) {
    scan_1[(i_1*stride_5)] = 0f32
  }
  for (scan.idx: int32, 0, (m - 1)) {
    for (i_2: int32, 0, n) {
      let cse_var_1: int32 = (scan.idx + 1)
      scan[((cse_var_1*stride_1) + (i_2*stride_4))] = (scan[((scan.idx*stride_1) + (i_2*stride_4))] + X[((cse_var_1*stride) + (i_2*stride_3))])
    }
    for (i_3: int32, 0, l) {
      scan_1[(((scan.idx + 1)*stride_2) + (i_3*stride_5))] = (scan_1[((scan.idx*stride_2) + (i_3*stride_5))] + scan[(scan.idx*stride_1)])
    }
  }
}
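
As with the previous cells, a verification sketch (again assuming the LLVM backend): state 1 is the cumulative sum of X over time, and state 2 accumulates the first column of state 1 from the previous timestep:

f2 = tvm.build(s, [X, s_scan1, s_scan2], "llvm", name="two_state_scan")
dev = tvm.cpu(0)
num_t, num_i, num_j = 10, 16, 8
x_np = np.random.uniform(size=(num_t, num_i)).astype("float32")
ref1 = np.cumsum(x_np, axis=0)                    # s_update1
ref2 = np.zeros((num_t, num_j), dtype="float32")  # s_init2 is all zeros
for t in range(1, num_t):
    ref2[t] = ref2[t - 1] + ref1[t - 1, 0]        # s_update2
a = tvm.nd.array(x_np, dev)
b1 = tvm.nd.array(np.zeros((num_t, num_i), dtype="float32"), dev)
b2 = tvm.nd.array(np.zeros((num_t, num_j), dtype="float32"), dev)
f2(a, b1, b2)
tvm.testing.assert_allclose(b1.numpy(), ref1, rtol=1e-5)
tvm.testing.assert_allclose(b2.numpy(), ref2, rtol=1e-5)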

Summary

This tutorial demonstrated how to use the scan primitive.

  • Describe a scan with init and update.
  • Schedule the scan cells as a normal schedule.
  • For complicated workloads, use multiple states and steps in the scan cell.

Download Python source code: scan.py

Download Jupyter Notebook: scan.ipynb
