MShadow中的表示式模板

ouyang發表於2022-02-27

原文網址 : https://www.cnblogs.com/shuo-ouyang/p/13914706.html

表示式模板是Eigen、GSL和boost.uBLAS等高效能C++矩陣庫的核心技術。本文基於MXNet給出的教程文件來闡述MXNet所依賴的高效能矩陣庫MShadow背後的原理。

編寫高效的機器學習程式碼

我們先來思考一個問題：如何才能編寫出高效的機器學習程式碼？假設DNN模型按照下面的程式碼進行權重更新，其中weight和grad都是長度為n的vector：

weight = -eta * (grad + lambda * weight)

既然我們選擇C++來實現矩陣計算，那麼效能肯定是最先要考慮的因素。在C++程式設計中，一個重要的原則就是——預先分配好所需的記憶體，不要在執行時申請分配臨時記憶體。因此，我們可能會實現一個名為UpdateWeight的函式，其形參grad和weight均為指標：

void UpdateWeight(const float *grad, float eta, float lambda,
                  int n, float *weight) {
  for (int i = 0; i < n; ++i) {
    weight[i] =  -eta * (grad[i] + lambda * weight[i]);
  }
}

指標grad和weight所指向的記憶體空間都是預先分配好的，函式在執行期只需要執行計算。顯然，上面的程式碼非常簡單直觀，但是，當我們反覆編寫它們時可能會很煩（指編寫迴圈處理每個元素）。因此問題是，我們能否像下面這樣編寫程式碼，同時又能獲得上述程式碼的效能呢？答案是肯定的，但是方法可能並沒有那麼直觀。

void UpdateWeight(const Vec& grad, float eta, float lambda, Vec& weight) {
  weight = -eta * (grad + lambda * weight);
}

運算子過載

運算子過載是一種非常容易想到的解決方案。通過過載相應的運算子，我們可以將元素處理的細節隱藏在運算子中，簡單地呼叫運算子就可以實現相應的操作。

// Naive solution for vector operation overloading
struct Vec {
  int len;
  float* dptr;
  Vec(int len) : len(len) {
    dptr = new float[len];
  }
  Vec(const Vec& src) : len(src.len) {
    dptr = new float[len];
    memcpy(dptr, src.dptr, sizeof(float)*len );
  }
  ~Vec(void) {
    delete [] dptr;
  }
};

inline Vec operator+(const Vec &lhs, const Vec &rhs) {
  Vec res(lhs.len);
  for (int i = 0; i < lhs.len; ++i) {
    res.dptr[i] = lhs.dptr[i] + rhs.dptr[i];
  }
  return res;
}

然而，這種方法並不高效，原因是每次呼叫運算子時都會有記憶體空間的申請和釋放。另一種更高效的方法是僅過載運算子+=和-=，他們無需臨時記憶體分配即可實現，但這又限制了我們可以呼叫的運算子的數量，得不償失。下一小節，我們將介紹如何利用表示式模板實現延遲計算。

延遲計算

在呼叫operator+時，因為我們不知道運算子的結果要賦值給哪個變數，所以需要申請一塊臨時記憶體空間把結果儲存下來。否則，如果我們能提前直到運算結果要存放在哪個變數中，那麼就可以直接將結果儲存到相應的記憶體空間。下面的程式碼說明了這一情況：

// Example Lazy evaluation code
// for simplicity, we use struct and make all members public
#include <cstdio>
struct Vec;
// expression structure holds the expression
struct BinaryAddExp {
  const Vec &lhs;
  const Vec &rhs;
  BinaryAddExp(const Vec &lhs, const Vec &rhs)
  : lhs(lhs), rhs(rhs) {}
};
// no constructor and destructor to allocate and de-allocate memory,
//  allocation done by user
struct Vec {
  int len;
  float* dptr;
  Vec(void) {}
  Vec(float *dptr, int len)
      : len(len), dptr(dptr) {}
  // here is where evaluation happens
  inline Vec &operator=(const BinaryAddExp &src) {
    for (int i = 0; i < len; ++i) {
      dptr[i] = src.lhs.dptr[i] + src.rhs.dptr[i];
    }
    return *this;
  }
};
// no evaluation happens here
inline BinaryAddExp operator+(const Vec &lhs, const Vec &rhs) {
  return BinaryAddExp(lhs, rhs);
}

const int n = 3;
int main(void) {
  float sa[n] = {1, 2, 3};
  float sb[n] = {2, 3, 4};
  float sc[n] = {3, 4, 5};
  Vec A(sa, n), B(sb, n), C(sc, n);
  // run expression
  A = B + C;
  for (int i = 0; i < n; ++i) {
    printf("%d:%f==%f+%f\\n", i, A.dptr[i], B.dptr[i], C.dptr[i]);
  }
  return 0;
}

在這段程式碼中，運算子operator+不進行實際的計算，只返回一個表示式結構BinaryAddExp，它裡面儲存了進行向量加法的兩個運算元。當過載operator=時，我們就可以知道向量加法的目標變數以及對應的兩個運算元，因此，在這種情況下，不需要任何（執行時）記憶體分配就可以執行計算操作！類似的，我們可以定義一個DotExp並在operator=處進行惰性求值，然後在內部呼叫BLAS實現矩陣乘法。

更復雜的表示式以及表示式模板

使用延遲計算能夠避免執行期的臨時記憶體分配。但是，上一小節的程式碼仍然面臨以下兩個問題：

只能編寫形如A=B+C的表示式，不能編寫類似A=B+C+D等更加複雜的表示式
如果想新增更多的表示式，就需要編寫更多的operator=來執行相應的計算

上述問題的解決方法就是使用模板程式設計。我們將BinaryAddExp實現成一個模板類，它儲存的兩個運算元都是模板，這樣就能夠實現任意長度的加法表示式，具體程式碼如下。

// Example code, expression template, and more length equations
// for simplicity, we use struct and make all members public
#include <cstdio>

// this is expression, all expressions must inheritate it,
//  and put their type in subtype
template<typename SubType>
struct Exp {
  // returns const reference of the actual type of this expression
  inline const SubType& self(void) const {
    return *static_cast<const SubType*>(this);
  }
};

// binary add expression
// note how it is inheritates from Exp
// and put its own type into the template argument
template<typename TLhs, typename TRhs>
struct BinaryAddExp: public Exp<BinaryAddExp<TLhs, TRhs> > {
  const TLhs &lhs;
  const TRhs &rhs;
  BinaryAddExp(const TLhs& lhs, const TRhs& rhs)
      : lhs(lhs), rhs(rhs) {}
  // evaluation function, evaluate this expression at position i
  inline float Eval(int i) const {
    return lhs.Eval(i) + rhs.Eval(i);
  }
};
// no constructor and destructor to allocate
// and de-allocate memory, allocation done by user
struct Vec: public Exp<Vec> {
  int len;
  float* dptr;
  Vec(void) {}
  Vec(float *dptr, int len)
      :len(len), dptr(dptr) {}
  // here is where evaluation happens
  template<typename EType>
  inline Vec& operator= (const Exp<EType>& src_) {
    const EType &src = src_.self();
    for (int i = 0; i < len; ++i) {
      dptr[i] = src.Eval(i);
    }
    return *this;
  }
  // evaluation function, evaluate this expression at position i
  inline float Eval(int i) const {
    return dptr[i];
  }
};
// template add, works for any expressions
template<typename TLhs, typename TRhs>
inline BinaryAddExp<TLhs, TRhs>
operator+(const Exp<TLhs> &lhs, const Exp<TRhs> &rhs) {
  return BinaryAddExp<TLhs, TRhs>(lhs.self(), rhs.self());
}

const int n = 3;
int main(void) {
  float sa[n] = {1, 2, 3};
  float sb[n] = {2, 3, 4};
  float sc[n] = {3, 4, 5};
  Vec A(sa, n), B(sb, n), C(sc, n);
  // run expression, this expression is longer:)
  A = B + C + C;
  for (int i = 0; i < n; ++i) {
    printf("%d:%f == %f + %f + %f\\n", i,
           A.dptr[i], B.dptr[i],
           C.dptr[i], C.dptr[i]);
  }
  return 0;
}

程式碼的關鍵思想是模板Exp<SubType>將其派生類SubType的型別作為模板引數，因此它可以通過self()將其自身轉換為SubType，這種模式被稱為奇異遞迴模板模式，簡稱CRTP。BinaryAddExp現在是一個模板類，可以將多個表示式複合在一起。真正的計算操作是通過函式Eval完成的，該函式在BinaryAddExp中以遞迴方式實現，operator=中的函式呼叫src.Eval(i)會被編譯成B.dptr[i] + C.dptr[i] + C.dptr[i]。

靈活性

前面的示例讓我們領略到模板程式設計的強大功能，而這最後一個示例則與MShadow的實現更為接近，它允許使用者自定義二元操作符。

// Example code, expression template
// with binary operator definition and extension
// for simplicity, we use struct and make all members public
#include <cstdio>

// this is expression, all expressions must inheritate it,
// and put their type in subtype
template<typename SubType>
struct Exp{
  // returns const reference of the actual type of this expression
  inline const SubType& self(void) const {
    return *static_cast<const SubType*>(this);
  }
};

// binary operators
struct mul{
  inline static float Map(float a, float b) {
    return a * b;
  }
};

// binary add expression
// note how it is inheritates from Exp
// and put its own type into the template argument
template<typename OP, typename TLhs, typename TRhs>
struct BinaryMapExp: public Exp<BinaryMapExp<OP, TLhs, TRhs> >{
  const TLhs& lhs;
  const TRhs& rhs;
  BinaryMapExp(const TLhs& lhs, const TRhs& rhs)
      :lhs(lhs), rhs(rhs) {}
  // evaluation function, evaluate this expression at position i
  inline float Eval(int i) const {
    return OP::Map(lhs.Eval(i), rhs.Eval(i));
  }
};
// no constructor and destructor to allocate and de-allocate memory
// allocation done by user
struct Vec: public Exp<Vec>{
  int len;
  float* dptr;
  Vec(void) {}
  Vec(float *dptr, int len)
      : len(len), dptr(dptr) {}
  // here is where evaluation happens
  template<typename EType>
  inline Vec& operator=(const Exp<EType>& src_) {
    const EType &src = src_.self();
    for (int i = 0; i < len; ++i) {
      dptr[i] = src.Eval(i);
    }
    return *this;
  }
  // evaluation function, evaluate this expression at position i
  inline float Eval(int i) const {
    return dptr[i];
  }
};
// template binary operation, works for any expressions
template<typename OP, typename TLhs, typename TRhs>
inline BinaryMapExp<OP, TLhs, TRhs>
F(const Exp<TLhs>& lhs, const Exp<TRhs>& rhs) {
  return BinaryMapExp<OP, TLhs, TRhs>(lhs.self(), rhs.self());
}

template<typename TLhs, typename TRhs>
inline BinaryMapExp<mul, TLhs, TRhs>
operator*(const Exp<TLhs>& lhs, const Exp<TRhs>& rhs) {
  return F<mul>(lhs, rhs);
}

// user defined operation
struct maximum{
  inline static float Map(float a, float b) {
    return a > b ? a : b;
  }
};

const int n = 3;
int main(void) {
  float sa[n] = {1, 2, 3};
  float sb[n] = {2, 3, 4};
  float sc[n] = {3, 4, 5};
  Vec A(sa, n), B(sb, n), C(sc, n);
  // run expression, this expression is longer:)
  A = B * F<maximum>(C, B);
  for (int i = 0; i < n; ++i) {
    printf("%d:%f == %f * max(%f, %f)\\n",
           i, A.dptr[i], B.dptr[i], C.dptr[i], B.dptr[i]);
  }
  return 0;
}

這段程式碼與上一小節程式碼的主要區別是模板類BinaryMapExp可以接受任意型別的二元操作符，要求是該操作符必須要實現一個Map函式。個人理解，第62行實現的那個F函式主要是為了編寫程式碼方便，如果沒有它，那麼第69行就要寫成BinaryMapExp<mul, TLhs, TRhs>(lhs.self(), rhs.self());，寫起來就比較麻煩。其他的地方基本上與前一小節的程式碼差不多，稍微一看就能明白。

小結

綜上所述，表示式模板基本工作原理包括以下幾點：

延遲計算，允許我們提前知道運算子和目標變數
組合模板與遞迴計算，允許我們執行任意element-wise操作的複合表示式
由於模板和內聯的存在，表示式模板能夠像編寫for迴圈那樣高效的實現element-wise的計算