[原始碼解析] NVIDIA HugeCTR，GPU 版本引數伺服器---(8) ---Distributed Hash之後向傳播

/**
 * All the CUDA kernel functions used by embedding layer are defined in this file, including
 * forward propagation, backward propagation. The functions are defined by propagation type
 * and combiner type(sum or mean) as below:
 *   1) forward
 *        sum: calling forward_sum_kernel()
 *        mean: calling foward_sum_kernel() + forward_scale_kernel()
 *   2) backward:
 *        calculating wgrad:
 *          sum: calling backward_sum_kernel()
 *          mean: calling backward_mean_kernel()
 *        update embedding table: including several steps as below,
 *          step1: expand sample IDs, calling sample_id_expand_kernel()
 *          step2: get value_index by key (will call hash_table->get_insert() in nv_hashtable lib)
 *          step3: sort by value_index (will call cub::DeviceRadixSort::SortPairs in cub lib)
 *          step4: count the number for each unduplicated value_index, calling value_count_kernel()
 *          step5: use optimizer method to compute deltaw, and record corresponding, including three
 * types of optimizer: Adam: caling opt_adam_kernel() Momentum sgd: calling
 * opt_momentum_sgd_kernel() Nesterov: calling opt_nesterov_kernel() step6: update embedding table
 * by deltaw, calling update_kernel()
 */

2.2 程式碼

在 session::train() 之中有如下程式碼，這些就對應了總體思路。

backward 進行反向傳播計算。
exchange_wgrad 進行交換梯度。
update_params 來更新引數。

      // Embedding backward
      for (const auto& one_embedding : embeddings_) {
        one_embedding->backward();
      }

      // Exchange wgrad and update params
      if (networks_.size() > 1) {
#pragma omp parallel num_threads(networks_.size())
        {
          size_t id = omp_get_thread_num();
          exchange_wgrad(id);
          networks_[id]->update_params();
        }
      } else if (resource_manager_->get_global_gpu_count() > 1) {
        exchange_wgrad(0);
        networks_[0]->update_params();
      }
      for (const auto& one_embedding : embeddings_) {
        one_embedding->update_params();
      }

0x03 輸入

我們首先看看如何獲取反向傳播的輸入。因為從嵌入層比較難以查詢，我們換個思路，從 reshape 層來看看。

3.1 定義

可以看到，其主要成員變數就是輸入 in_tensors_ 和輸出 out_tensors_。

/**
 * Layer which reshapes a 3D/2D input tensor to 2D output tensor,
 * e.g., (batch_size, n_slots, vector_size) to (batch_size, n_slots * vector_size),
 * e.g., (batch_size * n_slots, vector_size) to (batch_size, n_slots * vector_size),
 * If the input tensor is 3D, you can choose which slots participate by calling the different Ctor
 */
template <typename T>
class ReshapeLayerCPU : public LayerCPU {
  /*
   * stores the weight tensors of this layer.
   */
  Tensors2<T> weights_;
  /*
   * stores the weight gradient tensors of this layer.
   */
  Tensors2<T> wgrad_;
  /*
   * stores the references to the input tensors of this layer.
   */
  Tensors2<T> in_tensors_;
  /*
   * stores the references to the output tensors of this layer.
   */
  Tensors2<T> out_tensors_;

  bool in_place_;
  int batch_size_;
  int n_slot_;
  int vector_length_;
  size_t n_active_slot_;
  Tensor2<int> selected_tensor_;
  std::vector<int> selected_;
}

3.2 切換

從程式碼可以知道，在訓練時候就是反覆利用了這兩個成員變數 in_tensor 和 out_tensor 來做切換。

前向傳播時候，fprop是把資料從in_tensor拷貝到out_tensor。
後向傳播時候，bprop 是把資料從out_tensor拷貝到in_tensor。

所以，前向傳播的輸入變數，在反向傳播時候被用來作為輸入變數。因此我們可以知道嵌入層也是這個套路。

template <typename T>
void ReshapeLayer<T>::fprop(bool is_train) {
  prop_common(true, is_train, get_gpu().get_stream());
}

template <typename T>
void ReshapeLayer<T>::bprop() {
  prop_common(false, true, get_gpu().get_stream());
}

template <typename T>
void ReshapeLayer<T>::prop_common(bool forward, bool is_train, cudaStream_t stream) {
  CudaDeviceContext context(get_device_id());
  Tensor2<T>& in_tensor = get_in_tensors(is_train)[0];
  Tensor2<T>& out_tensor = out_tensors_[0];

  if (in_place_) {
    if (forward) { // 前向傳播
      CK_CUDA_THROW_(cudaMemcpyAsync(out_tensor.get_ptr(), in_tensor.get_ptr(),
                                     in_tensor.get_size_in_bytes(), cudaMemcpyDeviceToDevice,
                                     stream));
    } else { // 反向傳播
      CK_CUDA_THROW_(cudaMemcpyAsync(in_tensor.get_ptr(), out_tensor.get_ptr(),
                                     out_tensor.get_size_in_bytes(), cudaMemcpyDeviceToDevice,
                                     stream));
    }
  } else {
    int block_size = 128;
    int n_block = get_gpu().get_sm_count() * 16;
    T* in = in_tensor.get_ptr();
    T* out = out_tensor.get_ptr();
    reshape_kernel<<<n_block, block_size>>>(in, out, batch_size_, n_slot_, vector_length_,
                                            selected_tensor_.get_ptr(), n_active_slot_, forward);
  }
#ifndef NDEBUG
  CK_CUDA_THROW_(cudaDeviceSynchronize());
  CK_CUDA_THROW_(cudaGetLastError());
#endif
}

0x04 backward

4.1 總體程式碼

由之前分析我們可以知道，反向傳播時候，輸入的梯度就是儲存在embedding_data_.get_output_tensors(true)之中。總體程式碼分為兩部分，第一步是使用all-gather 操作來在每個GPU之上都收集到所有樣本的全部梯度。第二步是呼叫 functors_.backward進行計算。

/** 
 * The first stage of backward propagation of embedding layer,
 * which only computes the wgrad by the dgrad from the top layer.
 */
void backward() override {
  // Read dgrad from output_tensors -> compute wgrad

  // do all-gather to collect the top_grad
  size_t send_count = embedding_data_.get_batch_size_per_gpu(true) *
                      embedding_data_.embedding_params_.slot_num *
                      embedding_data_.embedding_params_.embedding_vec_size;
  functors_.all_gather(send_count, embedding_data_.get_output_tensors(true),
                       embedding_feature_tensors_, embedding_data_.get_resource_manager());

  // do backward
  functors_.backward(embedding_data_.embedding_params_.get_batch_size(true),
                     embedding_data_.embedding_params_.slot_num,
                     embedding_data_.embedding_params_.embedding_vec_size,
                     embedding_data_.embedding_params_.combiner, row_offset_allreduce_tensors_,
                     embedding_feature_tensors_, wgrad_tensors_,
                     embedding_data_.get_resource_manager());

  return;
}

4.2 AllGather

反向傳播的第一步是使用 all-gather 操作來在每個 GPU 之上都收集到的所有樣本的全部梯度，這樣後續可以進行計算並且更新每個 GPU 之上的引數。

4.2.1 原理

首先我們看 AllGather 原理。在執行 AllGather 操作時，K個處理器之中，每個處理器都會將來自每個處理器的N個值聚整合一個維度為K*N的輸出。輸出是按rank索引排序的。AllGather操作會受到不同rank或裝置對映的影響，因為rank決定了資料佈局。

注意：執行ReduceScatter + AllGather，就等同於AllReduce操作。

4.2.2 程式碼

呼叫程式碼如下，可以看到其會把梯度從反向傳播的輸入 embedding_data_.get_output_tensors(true) 拷貝到 embedding_feature_tensors_。因此，embedding_feature_tensors_ 將會擁有所有的梯度。

  functors_.all_gather(send_count, embedding_data_.get_output_tensors(true),
                       embedding_feature_tensors_, embedding_data_.get_resource_manager());

運算元如下：

/**
 * collection communication: all_gather.
 * @param send_count the count of elements will be sent.
 * @param send_tensors the send tensors of multi GPUs.
 * @param recv_tensors the recv tensors of multi GPUs.
 * @param device_resources all gpus device resources.
 * @param context gpu device context, for switching device.
 */
template <typename Type>
void SparseEmbeddingFunctors::all_gather(size_t send_count, const Tensors2<Type> &send_tensors,
                                         Tensors2<Type> &recv_tensors,
                                         const ResourceManager &resource_manager) {
  size_t local_gpu_count = resource_manager.get_local_gpu_count();
  size_t total_gpu_count = resource_manager.get_global_gpu_count();

  // need to know the Type
  ncclDataType_t type;
  switch (sizeof(Type)) {
    case 2:
      type = ncclHalf;
      break;
    case 4:
      type = ncclFloat;
      break;
    default:
      CK_THROW_(Error_t::WrongInput, "Error: Type not support by now");
  }

  // for multi GPUs, use NCCL to do All-Gather
  if (total_gpu_count > 1) {
    CK_NCCL_THROW_(ncclGroupStart());
    for (size_t id = 0; id < local_gpu_count; id++) {
      const auto &local_gpu = resource_manager.get_local_gpu(id);
      CK_NCCL_THROW_(ncclAllGather(send_tensors[id].get_ptr(),  // send buff
                                   recv_tensors[id].get_ptr(),  // recv buff
                                   send_count, type, local_gpu->get_nccl(),
                                   local_gpu->get_stream()));
    }
    CK_NCCL_THROW_(ncclGroupEnd());
  }
  // for single GPU, just do memcpyD2D
  else {  // total_gpu_count == 1
    const auto &local_gpu = resource_manager.get_local_gpu(0);
    CudaDeviceContext context(local_gpu->get_device_id());
    CK_CUDA_THROW_(cudaMemcpyAsync(recv_tensors[0].get_ptr(), send_tensors[0].get_ptr(),
                                   send_count * sizeof(Type), cudaMemcpyDeviceToDevice,
                                   local_gpu->get_stream()));
  }

  return;
}

4.3 backward

這部分完成如下功能：計算本地每個gpu上的梯度。此函式完成之後，wgrad_tensors_ 成員變數就是本GPU計算產生的新梯度。

// do backward
functors_.backward(embedding_data_.embedding_params_.get_batch_size(true),
                   embedding_data_.embedding_params_.slot_num,
                   embedding_data_.embedding_params_.embedding_vec_size,
                   embedding_data_.embedding_params_.combiner, row_offset_allreduce_tensors_,
                   embedding_feature_tensors_, wgrad_tensors_,
                   embedding_data_.get_resource_manager());

calculating wgrad，會選擇如下兩種之一：

sum: calling backward_sum_kernel() ；
mean: calling backward_mean_kernel()；

具體backward程式碼如下：

template <typename TypeHashKey, typename TypeEmbeddingComp>
void SparseEmbeddingFunctors::backward(size_t batch_size,
                                       const std::vector<size_t> &slot_num_per_gpu,
                                       size_t embedding_vec_size, int combiner,
                                       const Tensors2<TypeHashKey> &row_offset_allreduce_tensors,
                                       const Tensors2<TypeEmbeddingComp> &embedding_feature_tensors,
                                       Tensors2<TypeEmbeddingComp> &wgrad_tensors,
                                       const ResourceManager &resource_manager) {
  size_t local_gpu_count = resource_manager.get_local_gpu_count();

  CudaDeviceContext context;
  for (size_t id = 0; id < local_gpu_count; id++) { // 遍歷本地GPU
    if (slot_num_per_gpu[id] == 0) {
      continue;
    }

    const auto &local_gpu = resource_manager.get_local_gpu(id);
    context.set_device(local_gpu->get_device_id());
    // 拿到某一個GPU對應的梯度和offset資訊
    const TypeEmbeddingComp *top_grad = embedding_feature_tensors[id].get_ptr();
    const TypeHashKey *row_offset = row_offset_allreduce_tensors[id].get_ptr();
    TypeEmbeddingComp *wgrad = wgrad_tensors[id].get_ptr();

    // 計算更新本地梯度
    if (combiner == 0)  // sum
    {
      backward_sum(batch_size, slot_num_per_gpu[id], embedding_vec_size, top_grad, wgrad,
                   local_gpu->get_stream());
    } else if (combiner == 1)  // mean
    {
      backward_mean(batch_size, slot_num_per_gpu[id], embedding_vec_size, row_offset, top_grad,
                    wgrad, local_gpu->get_stream());
    } else {
      CK_THROW_(Error_t::WrongInput, "Invalid combiner type ");
    }
  }

  return;
}

我們以backward_sum 為例，這裡採用了GPU多執行緒更新以加快速度。

template <typename TypeEmbeddingComp>
void backward_sum(size_t batch_size, size_t slot_num, size_t embedding_vec_size,
                  const TypeEmbeddingComp *top_grad, TypeEmbeddingComp *wgrad,
                  cudaStream_t stream) {
  const size_t grid_size = batch_size;  // each block corresponds to a sample
  const size_t block_size = embedding_vec_size;
  backward_sum_kernel<<<grid_size, block_size, 0, stream>>>(batch_size, slot_num,
                                                            embedding_vec_size, top_grad, wgrad);
}

// backward kernel function: for combiner=sum
template <typename TypeEmbeddingComp>
__global__ void backward_sum_kernel(int batch_size, int slot_num, int embedding_vec_size,
                                    const TypeEmbeddingComp *top_grad, TypeEmbeddingComp *wgrad) {
  int tid = threadIdx.x;
  int bid = blockIdx.x;

  if (bid < batch_size && tid < embedding_vec_size) {
    for (int i = 0; i < slot_num; i++) {
      // 先找到某一個稠密張量的位置，再加上tid就得到了張量之中某一個元素（本tid對應的元素）的位置
      size_t feature_index = (size_t)(bid * slot_num + i) * embedding_vec_size + tid;
      // 更新梯度數值
      wgrad[feature_index] = top_grad[feature_index];
    }
  }
}

作為對比，貼出backward_mean_kernel，大家可以比對學習。

// backward kernel function: for combiner=mean
template <typename TypeKey, typename TypeEmbeddingComp>
__global__ void backward_mean_kernel(int batch_size, int slot_num, int embedding_vec_size,
                                     const TypeKey *row_offset, const TypeEmbeddingComp *top_grad,
                                     TypeEmbeddingComp *wgrad) {
  int bid = blockIdx.x;
  int tid = threadIdx.x;

  if (bid < batch_size && tid < embedding_vec_size) {
    for (int i = 0; i < slot_num; i++) {
      size_t feature_row_index = bid * slot_num + i;
      int value_num = row_offset[feature_row_index + 1] - row_offset[feature_row_index];
      float scaler = 1.0f;
      if (value_num > 1) {
        scaler = 1.0f / value_num;  // partial derivatice of MEAN
      }

      size_t feature_index = feature_row_index * embedding_vec_size + tid;
      float g = TypeConvertFunc<float, TypeEmbeddingComp>::convert(top_grad[feature_index]);
      g *= scaler;
      wgrad[feature_index] = TypeConvertFunc<TypeEmbeddingComp, float>::convert(g);
    }
  }
}

現在，wgrad_tensors_ 之中已經是本地 GPU 產生的梯度了，需要根據這個來更新嵌入層權重，就是更新 hash_table_value 的內容。

0x05 ExchangeWgrad

session.train 接下來會交換梯度和更新網路引數。

      // Exchange wgrad and update params
      if (networks_.size() > 1) {
#pragma omp parallel num_threads(networks_.size())
        {
          size_t id = omp_get_thread_num();
          exchange_wgrad(id);
          networks_[id]->update_params();
        }
      } else if (resource_manager_->get_global_gpu_count() > 1) {
        exchange_wgrad(0);
        networks_[0]->update_params();
      }

具體程式碼如下：

void Session::exchange_wgrad(size_t device_id) {
  auto& gpu_resource = resource_manager_->get_local_gpu(device_id);
  CudaCPUDeviceContext context(gpu_resource->get_device_id());
  PROFILE_RECORD("exchange_wgrad.start", gpu_resource->get_stream(), false);
  exchange_wgrad_->allreduce(device_id, gpu_resource->get_stream());
  PROFILE_RECORD("exchange_wgrad.stop", gpu_resource->get_stream(), false);
}

5.1 定義

從定義可以看到，ExchangeWgrad 的功能就是簡單封裝底層資源。

class ExchangeWgrad {
 public:
  virtual void allocate() = 0;
  virtual void update_embed_wgrad_size(size_t size) = 0;
  virtual void allreduce(size_t device_id, cudaStream_t stream) = 0;
};

template <typename TypeFP>
class NetworkExchangeWgrad : public ExchangeWgrad {
 public:
  const BuffPtrs<TypeFP>& get_network_wgrad_buffs() const { return network_wgrad_buffs_; }
  const BuffPtrs<TypeFP>& get_embed_wgrad_buffs() const { return null_wgrad_buffs_; }
  void allocate() final;
  void update_embed_wgrad_size(size_t size) final;
  void allreduce(size_t device_id, cudaStream_t stream);
  NetworkExchangeWgrad(const std::shared_ptr<ResourceManager>& resource_manager);
  ~NetworkExchangeWgrad() = default;

 private:
  BuffPtrs<TypeFP> network_wgrad_buffs_;
  BuffPtrs<TypeFP> null_wgrad_buffs_;
  std::vector<std::shared_ptr<GeneralBuffer2<CudaAllocator>>> bufs_;
  std::shared_ptr<ResourceManager> resource_manager_;

  AllReduceInPlaceComm::Handle ar_handle_;

  size_t network_wgrad_size_ = 0;
  size_t num_gpus_ = 0;
};

template <typename TypeFP>
class GroupedExchangeWgrad : public ExchangeWgrad {
 public:
  const BuffPtrs<TypeFP>& get_network_wgrad_buffs() const { return network_wgrad_buffs_; }
  const BuffPtrs<TypeFP>& get_embed_wgrad_buffs() const { return embed_wgrad_buffs_; }
  void allocate() final;
  void update_embed_wgrad_size(size_t size) final;
  void allreduce(size_t device_id, cudaStream_t stream);
  GroupedExchangeWgrad(const std::shared_ptr<ResourceManager>& resource_manager);
  ~GroupedExchangeWgrad() = default;

 private:
  BuffPtrs<TypeFP> network_wgrad_buffs_;
  BuffPtrs<TypeFP> embed_wgrad_buffs_;
  std::vector<std::shared_ptr<GeneralBuffer2<CudaAllocator>>> bufs_;
  std::shared_ptr<ResourceManager> resource_manager_;

  AllReduceInPlaceComm::Handle ar_handle_;

  size_t network_wgrad_size_ = 0;
  size_t embed_wgrad_size_ = 0;
  size_t num_gpus_ = 0;
};

5.2 功能

交換功能主要是使用底層 all_reduce 來完成操作。

template <typename T>
void NetworkExchangeWgrad<T>::allreduce(size_t device_id, cudaStream_t stream) {
  auto ar_comm = resource_manager_->get_ar_comm();
  ar_comm->all_reduce(ar_handle_, stream, device_id);
}

template <typename T>
void GroupedExchangeWgrad<T>::allreduce(size_t device_id, cudaStream_t stream) {
  auto ar_comm = resource_manager_->get_ar_comm();
  ar_comm->all_reduce(ar_handle_, stream, device_id);
}

0x06 更新引數

Session.train 接下來會讓嵌入層來更新引數，具體是使用優化器進行更新。

      for (const auto& one_embedding : embeddings_) {
        one_embedding->update_params();
      }

具體程式碼如下，其主要邏輯就是在優化器和backward()產生的wgrad合作下，更新hash table。

  /**
   * The second stage of backward propagation of embedding layer, which
   * updates the hash table by wgrad(from backward()) and optimizer.
   */
  void update_params() override {
    // accumulate times for adam optimizer
    embedding_data_.embedding_params_.opt_params.hyperparams.adam.times++;
#pragma omp parallel num_threads(embedding_data_.get_resource_manager().get_local_gpu_count())
    {
      size_t id = omp_get_thread_num();
      CudaDeviceContext context(embedding_data_.get_local_gpu(id).get_device_id());
      // do update params operation
      embedding_optimizers_[id].update(
          embedding_data_.embedding_params_.get_batch_size(true),
          embedding_data_.embedding_params_.slot_num,
          embedding_data_.embedding_params_.embedding_vec_size, max_vocabulary_size_per_gpu_,
          *embedding_data_.get_nnz_array(true)[id],
          embedding_data_.get_row_offsets_tensors(true)[id], hash_value_index_tensors_[id],
          wgrad_tensors_[id], hash_table_value_tensors_[id],
          embedding_data_.get_local_gpu(id).get_sm_count(),
          embedding_data_.get_local_gpu(id).get_stream());
    }

    return;
  }

這部分是反向操作的難點。現在的問題是，wgrad_tensors_ 之中已經是梯度了，需要根據這個來更新嵌入層權重，就是 hash_table_value。但是如何更新呢？比如怎樣利用GPU多執行緒更新？是否需要更新 hash_value_index_index？我們接下來一步一步分析。

6.1 問題和思路

假如batch_size=2，slot_num=2，給出一個CSR例子格式如下（兩個樣本）：

*   40,50,10,20 // 樣本1，slot 1
*   30,50,10 // 樣本1，slot 2
*   30,20 // 樣本2，slot 1
*   10 // 樣本2，slot 2
* Will be convert to the form of:
* row offset: 0,4,7,9,10
* value: 40,50,10,20,30,50,10,30,20,10

6.1.1 前向傳播

下圖是前向傳播的embedding look示例，最後生成的 embedding_feature 之中，embedding vector個數是：batch_size x slot_num，針對我們的例子：40,50,10,20,30,50,10,30,20,10，分成slot就是：[40,50,10,20],[30,50,10],[30,20],[10]。分別對應embedding_feature矩陣中的四行。

注：最後輸出的是 train_output_tensors_，中間變數為 embedding_feature，embedding_feature 經過了幾次GPU之間的通訊變化之後演化成了train_output_tensors_ ，兩者維度相同，所以我們就使用 embedding_feature。下面圖之中數字是構造出來，只供演示使用。

我們給出 embedding_feature 之中第三條向量的計算過程，他對應了第二個樣本的第一個slot，就是 "30,20"。所以就是從 hash_table_value 選出了第2行，第3行，對應位置元素相加，即圖中給出的計算過程。

6.1.2 後向傳播

我們再考慮後向傳播。

後向傳播時候用梯度來更新權重，g31,g32,g33,g34 這一行就應該更新 hash_table_value 的第2行，第3行。另外，如果假設第二個樣本的第一個slot 是 "30,20,20,20"，那麼其實就應該用梯度更新hash_table_value 的第2行三次，第3行一次。其實也可以看出來，這種更新不要知道 train_value的數值究竟是什麼。

6.1.3 思路

我們先用常規思路來梳理一下上面例子：

sample_id 列表對應的是40,50,10,20,.....，20 是一個key，它在低維嵌入表 hash_table_value 之中對應一個稠密向量（第2行 10,20,30,40），裡面是權重。
嵌入層輸出是embedding_feature。
- embedding vector個數是：batch_size x slot_num，也就是說，CSR 有幾行，這裡就有幾個向量。
- 其中第三條向量對應了第二個樣本的第一個slot，就是 "30,20"。所以就是從 hash_table_value 選出了第2行，第3行，對應位置元素相加： 10,220,330,440,550 = (10+100),(20+200),(30+300),(40+400),(50+500)。
如果有了梯度稠密向量，其是被 hash table value 若干稠密向量做pooling得到的結果。
- 比如，梯度矩陣第三條向量 g31,g32,g33,g34 對應的就是 embedding_feature 第三條向量 10,220,330,440,550，如果梯度更新權重，就應該更新hash_table_value 的第2行，第3行。
- 如果樣本slot之中有多個同樣數值，比如第二個樣本的第一個slot 是 "30,20,20,20"，那麼其實就應該用更新hash_table_value 的第 2 行三次，第 3 行一次。

我們接著從CUDA角度來看如何更新，其目的是讓每一個block 更新一個低維矩陣 hash_table_value 的一行，所以有幾個問題：

如何依據本GPU執行緒的 block id 找到其在低維矩稠密向量陣之中的row offset，假設是第二行。
如何知道本 block 應該更新第二行幾次。
更新這幾次，分別用哪一個梯度來更新。
- 比如第1個梯度可能更新第二行，第三個梯度也可能更新第二行。針對我們的例子：40,50,10,20,30,50,10,30,20,10，分成slot就是：[40,50,10,20],[30,50,10],[30,20],[10]。分別對應梯度矩陣中的四行，所以需要從梯度矩陣之中1，2，4行的梯度來更新 10 對應的 hash_table_value。
- 具體參見下圖，這裡 train_value 到 gradient 只是示意，就是邏輯上一一對應。

這裡有一個疑問，為什麼不像前向傳播那樣操作，而是要另外重起爐灶呢？這是因為我們不需要知道樣本數值就可以更新權重，不需要把40,50,10,20,.....,等等重新走一遍操作雜湊表的流程。所以，接下來就看看HugeCTR如何解決這幾個問題，這裡程式碼比較燒腦。

6.2 嵌入層更新

我們首先看看嵌入層的總體程式碼和註釋裡面提到的思路。

6.2.1 註釋

註釋裡面關於更新的部分有5步，我們可以看到其大致思路：

     step1: expand sample IDs, calling sample_id_expand_kernel()；

     step2: get value_index by key (will call hash_table->get_insert() in nv_hashtable lib)；

     step3: sort by value_index (will call cub::DeviceRadixSort::SortPairs in cub lib)；

     step4: count the number for each unduplicated value_index, calling value_count_kernel()；

     step5: use optimizer method to compute deltaw, and record corresponding；

/**
 * All the CUDA kernel functions used by embedding layer are defined in this file, including
 * forward propagation, backward propagation. The functions are defined by propagation type
 * and combiner type(sum or mean) as below:
 *   1) forward
 *        sum: calling forward_sum_kernel()
 *        mean: calling foward_sum_kernel() + forward_scale_kernel()
 *   2) backward:
 *        calculating wgrad:
 *          sum: calling backward_sum_kernel()
 *          mean: calling backward_mean_kernel()
 *        update embedding table: including several steps as below,
 *          step1: expand sample IDs, calling sample_id_expand_kernel()
 *          step2: get value_index by key (will call hash_table->get_insert() in nv_hashtable lib)
 *          step3: sort by value_index (will call cub::DeviceRadixSort::SortPairs in cub lib)
 *          step4: count the number for each unduplicated value_index, calling value_count_kernel()
 *          step5: use optimizer method to compute deltaw, and record corresponding, including three
 * types of optimizer: Adam: caling opt_adam_kernel() Momentum sgd: calling
 * opt_momentum_sgd_kernel() Nesterov: calling opt_nesterov_kernel() step6: update embedding table
 * by deltaw, calling update_kernel()
 */

6.2.2 update程式碼

我們摘錄 EmbeddingOptimizer::update 的程式碼如下，這裡只是選擇了Optimizer_t::AdaGrad相關部分，其通過 opt_adagrad_kernel 進行更新。這裡可以清楚看到註釋中的各個步驟，我們接下來就會逐一分析。

template <typename TypeHashKey, typename TypeEmbeddingComp>
void EmbeddingOptimizer<TypeHashKey, TypeEmbeddingComp>::update(
    size_t batch_size, size_t slot_num, size_t embedding_vec_size,
    size_t max_vocabulary_size_per_gpu, size_t nnz, const Tensor2<TypeHashKey> &row_offset,
    Tensor2<size_t> &hash_value_index, const Tensor2<TypeEmbeddingComp> &wgrad,
    Tensor2<float> &hash_table_value, size_t sm_count, cudaStream_t stream) {
  OptimizerTensor<TypeEmbeddingComp> &opt_tensor = opt_tensors_;
  OptParams &opt_params = param.opt_params;
  Tensor2<TypeHashKey> &sample_id = sample_id_tensors_;
  Tensor2<TypeHashKey> &sample_id_sort = sample_id_sort_tensors_;
  Tensor2<size_t> &hash_value_index_sort = hash_value_index_sort_tensors_;
  Tensor2<uint32_t> &hash_value_index_count_offset = hash_value_index_count_offset_tensors_;
  Tensor2<uint32_t> &new_hash_value_flag = new_hash_value_flag_tensors_;
  Tensor2<uint32_t> &hash_value_flag_sumed = hash_value_flag_sumed_tensors_;
  Tensor2<uint32_t> &hash_value_index_count_counter = hash_value_index_count_counter_tensors_;
  Tensor2<void> &temp_storage_sort = temp_storage_sort_tensors_;
  Tensor2<void> &temp_storage_scan = temp_storage_scan_tensors_;

  size_t block_size, grid_size;

  try {
    // step1: expand sample IDs
    block_size = 64;
    grid_size = (batch_size * slot_num - 1) / block_size + 1;
    sample_id_expand_kernel<<<grid_size, block_size, 0, stream>>>(
        batch_size, slot_num, row_offset.get_ptr(), sample_id.get_ptr());

    if (opt_params.optimizer == Optimizer_t::SGD &&
        opt_params.hyperparams.sgd.atomic_update) {  // for SGD, do atomic update
      const size_t block_size = embedding_vec_size;
      const size_t grid_size = min(max(1ul, nnz), sm_count * 32);

      float lr_scale = opt_params.lr / opt_params.scaler;
      opt_sgd_atomic_kernel<<<grid_size, block_size, 0, stream>>>(
          nnz, embedding_vec_size, lr_scale, hash_value_index.get_ptr(), sample_id.get_ptr(),
          wgrad.get_ptr(), hash_table_value.get_ptr());
    } else {
      // step3: sort by hash_value_index
      int end_bit = static_cast<int>(log2(static_cast<float>(max_vocabulary_size_per_gpu))) + 1;
      size_t temp_storage_sort_size = temp_storage_sort.get_size_in_bytes();
      CK_CUDA_THROW_(cub::DeviceRadixSort::SortPairs(
          temp_storage_sort.get_ptr(), temp_storage_sort_size, hash_value_index.get_ptr(),
          hash_value_index_sort.get_ptr(), sample_id.get_ptr(), sample_id_sort.get_ptr(), nnz, 0,
          end_bit, stream, false));

      // step4: count the number for each unduplicated hash_value_index
      CK_CUDA_THROW_(
          cudaMemsetAsync(hash_value_index_count_counter.get_ptr(), 0, sizeof(uint32_t), stream));

      constexpr size_t max_grid_size = 384;
      block_size = 256;
      grid_size = min(max_grid_size, (nnz - 1) / block_size + 1);

      value_count_kernel_1<<<grid_size, block_size, 0, stream>>>(
          nnz, hash_value_index_sort.get_ptr(), new_hash_value_flag.get_ptr());

      // a pinned memroy
      CK_CUDA_THROW_(cudaMemcpyAsync(&hash_hash_value_index_count_num,
                                     hash_value_index_count_counter.get_ptr(), sizeof(uint32_t),
                                     cudaMemcpyDeviceToHost, stream));

      // step5: use optimizer method to compute deltaw and update the parameters
      block_size = embedding_vec_size;
      grid_size = max(1, hash_hash_value_index_count_num);

      switch (opt_params.update_type) {
        case Update_t::Global: {
          switch (opt_params.optimizer) {
            case Optimizer_t::Adam: {
            }
            case Optimizer_t::AdaGrad: {
              opt_adagrad_kernel<<<grid_size, block_size, 0, stream>>>(
                  hash_hash_value_index_count_num, embedding_vec_size, opt_params.lr,
                  opt_params.hyperparams.adagrad, opt_tensor.opt_accm_tensors_.get_ptr(),
                  sample_id_sort.get_ptr(), hash_value_index_sort.get_ptr(),
                  hash_value_index_count_offset.get_ptr(), wgrad.get_ptr(),
                  hash_table_value.get_ptr(), opt_params.scaler);
              break;
            }
            case Optimizer_t::MomentumSGD:
            case Optimizer_t::Nesterov:
            case Optimizer_t::SGD:
            default:
              CK_THROW_(Error_t::WrongInput, "Error: Invalid opitimizer type");
          }  // switch (optimizer)
          break;
        }
        case Update_t::Local: {
          switch (opt_params.optimizer) {
            case Optimizer_t::Adam: {
            }
            case Optimizer_t::AdaGrad: {
              opt_adagrad_kernel<<<grid_size, block_size, 0, stream>>>(
                  hash_hash_value_index_count_num, embedding_vec_size, opt_params.lr,
                  opt_params.hyperparams.adagrad, opt_tensor.opt_accm_tensors_.get_ptr(),
                  sample_id_sort.get_ptr(), hash_value_index_sort.get_ptr(),
                  hash_value_index_count_offset.get_ptr(), wgrad.get_ptr(),
                  hash_table_value.get_ptr(), opt_params.scaler);
              break;
            }
            case Optimizer_t::MomentumSGD:
            case Optimizer_t::Nesterov:
            case Optimizer_t::SGD:
            default:
              CK_THROW_(Error_t::WrongInput, "Error: Invalid opitimizer type");
          }  // switch (optimizer)
          break;
        }
        case Update_t::LazyGlobal: {
          switch (opt_params.optimizer) {
            case Optimizer_t::Adam: {
            }
            case Optimizer_t::AdaGrad:
            case Optimizer_t::MomentumSGD:
            case Optimizer_t::Nesterov:
            case Optimizer_t::SGD: {
              CK_THROW_(Error_t::WrongInput,
                        "Error: lazy global update is only implemented for Adam");
              break;
            }
            default:
              CK_THROW_(Error_t::WrongInput, "Error: Invalid opitimizer type");
          }
          break;
        }
        default:
          CK_THROW_(Error_t::WrongInput, "Error: Invalid update type");
      }  // switch (update type)
    }
#ifndef NDEBUG
    cudaDeviceSynchronize();
    CK_CUDA_THROW_(cudaGetLastError());
#endif
  } catch (const std::runtime_error &rt_err) {
    std::cerr << rt_err.what() << std::endl;
    throw;
  }

  return;
}

首先要說明，這裡nnz（non-zero feature number per batch）來自如下，就是本樣本之中非零key個數。

std::vector<std::shared_ptr<size_t>>& get_nnz_array(bool is_train) { 
	if (is_train) {    
		return train_nnz_array_;  
	} else {    
		return evaluate_nnz_array_;  
	}
}

我們接下來逐一看看這些步驟。

6.3 擴充sample id

這裡對應了第一步，在後續程式碼之中，每個key對應了一個sample ID。總體思路就是找到每個 key（sample ID）和梯度矩陣，或者說和embedding_feature之中哪一行相對應，我們後續就直接以 embedding_feature來看，暫時不考慮梯度矩陣。可以大致理解為把樣本id擴充套件為key id的列表。

step1: expand sample IDs, calling sample_id_expand_kernel()

就是呼叫 sample_id_expand_kernel 來擴充sample id。這裡 sample_id 是成員變數 sample_id_tensors_的引用，這樣就可以直接修改成員變數。

Tensor2<TypeHashKey> sample_id_tensors_; /**< The temp memory to store the sample ids of hash table value in update_params(). */

具體程式碼如下：

Tensor2<TypeHashKey> &sample_id = sample_id_tensors_;

// step1: expand sample IDs
block_size = 64;
grid_size = (batch_size * slot_num - 1) / block_size + 1;
sample_id_expand_kernel<<<grid_size, block_size, 0, stream>>>(
    batch_size, slot_num, row_offset.get_ptr(), sample_id.get_ptr());

通過前面分析我們知道，embedding vector個數是：batch_size x slot_num，也就是說，CSR 有幾行，這裡就有幾個向量。所以這裡就直接讀取CSR行資訊即可。即， sample_id_expand_kernel 會把 sample_id_tensors_ 設定為 CSR row offset（expand sample id by row_offset），就是找到 CSR row offset 之中的index。

CSR row_offset = [0,4,7,9,10]，樣本之中key的數值是40,50,10,20,30,50,10,30,20,10，那麼 40,50,10,20對應了 0，30,50,10對應了1，30,20對應了 2，10對應了3。因此，sample_id 數值是 [0,0,0,0,1,1,1,2,2,3]，就是記錄了該 batch 在 embedding_feature_tensors_ 之中的 row index。

sample_id_expand_kernel 程式碼如下，這裡幾個重點：

gid 是grid ID，表示本執行緒對應了embedding_feature_tensors_ 哪個元素。
blockIdx 表示一個樣本。
(batch_size * slot_num) 代表本batch在嵌入層輸出 train_output_tensors_ 之中對應了多少行，或者說是在 embedding_feature_tensors_ 之中佔據了多少行，其實 embedding_feature_tensors_ 也就這麼大。
sample_id[offset + i] = gid; 目的就是記錄該樣本某key在 embedding_feature_tensors_ 之中的 row index（對應哪一行）。embedding_feature_tensors_ 這個稠密向量是由 hash_table_value 之中"CSR 本行的元素數目"個稠密向量做pooling得到的結果。

// expand sample id by row_offset
template <typename TypeKey>
__global__ void sample_id_expand_kernel(int batch_size, int slot_num, const TypeKey *row_offset, TypeKey *sample_id) {
  
  // 本執行緒對應的grid id，其實對應的就是global thread id
  int gid = blockIdx.x * blockDim.x + threadIdx.x; 

  if (gid < (batch_size * slot_num)) { // 假如batch_size=2，slot_num=2，取值為 gid < 4
    // 並不是每個GPU執行緒都會走到這裡，對應我們的假設，則只會取出gid = 0~3 這樣的執行緒才會進行下面配置操作
    // 比如，假定gid取值範圍8，那麼只有gid=0,gid=1,gid=2,gid=3 這幾個執行緒會進入if，執行操作，其餘執行緒不會進入，比如grid=4就不會進入
    TypeKey offset = row_offset[gid]; // 拿到對應的個數，比如 row_offset[0]，row_offset[1]，row_offset[2]的數值
    int value_num = row_offset[gid + 1] - offset; // 拿到CSR 本行的元素數目
    for (int i = 0; i < value_num; i++) {
      sample_id[offset + i] = gid; // 記錄該樣本某key在 embedding_feature_tensors_ 之中的 row index
    }
  }
}

我們把目前涉及的變數整理如下，這裡假定從CSR數值到hash_value_index_tensors_ 行的對映是取十位數，比如50就對映到第5行。

名稱	數值	意義
CSR row offset	0,4,7,9,10	兩個樣本，兩個slot，所以分成四行
CSR value	40,50,10,20,30,50,10,30,20,10	樣本內容
hash_value_index_tensors_	4,5,1,2,3,5,1,3,2,1	低維嵌入表的index，樣本每個key對應一個，比如50對應了 hash_table_value 第5行
hash_table_value	5 x 8 的矩陣	低維嵌入表，假定稠密向量長度是8，因為一共只有5個不同數字，所以只有5行
embedding_feature_tensors_	4 x 8 的矩陣	嵌入層輸出的稠密向量。形狀是(batch_size * slot_num) * embedding_vec_len
sample_id	0,0,0,0,1,1,1,2,2,3	每個樣本的每個key 對應了embedding_feature_tensors_ 中的 row index。比如CSR第一行是40,50,10,20，它們都為 embedding_feature_tensors_ 的第一行做出了貢獻。

6.4 從key得到value_index

下面我們看看第二步，根據key獲取到 hash table value index。

step2: get value_index by key (will call hash_table->get_insert() in nv_hashtable lib)

這部分只是在 test/utest/embedding/sparse_embedding_hash_cpu.hpp 之中有，因為是測試程式碼，所以此時雜湊表沒有資料，需要設定，訓練程式碼不需要這一步。

對應程式碼就是：

// step2: do hash table get() value_index by key
int nnz = row_offset_[batchsize_ * slot_num_];
hash_table_->get(hash_key_.get(), hash_value_index_.get(), nnz);

HashTableCpu 的get方法如下：

  void get(const KeyType* keys, ValType* vals, size_t len) const {
    if (len == 0) {
      return;
    }
    for (size_t i = 0; i < len; i++) {
      auto it = table_->find(keys[i]);
      assert(it != table_->end() && "error: can't find key");
      vals[i] = it->second;
    }
  }

6.5 排序

這部分對應第三步：

step3: sort by value_index (will call cub::DeviceRadixSort::SortPairs in cub lib)

現在得到了：sample_id 數值是 [0,0,0,0,1,1,1,2,2,3]，就是記錄了該 batch 在 embedding_feature_tensors_ 之中的 row index。

就是把 sample_id 按照 hash_value_index 來排序，最後排序結果放入 hash_value_index_sort 和 sample_id_sort。在我們例子之中，得到結果如下：hash_value_index_sort 是 [1,1,1,2,2,3,3,4,5,5]。sample_id_sort 是 [0,1,3,0,2,1,2,0,0,1 ]。

我們還是用表格記錄：

名稱	數值	意義
CSR row offset	0,4,7,9,10	兩個樣本，兩個slot，所以分成四行
CSR value	40,50,10,20,30,50,10,30,20,10	樣本內容
hash_value_index_tensors_	4,5,1,2,3,5,1,3,2,1	低維嵌入表的index，樣本每個key對應一個，比如50對應了 hash_table_value 第5行
hash_table_value	5 x 8 的矩陣	低維嵌入表，假定稠密向量長度是8，因為一共只有5個不同數字，所以只有5行
embedding_feature_tensors_	4 x 8 的矩陣	嵌入層輸出的稠密向量。形狀是(batch_size * slot_num) * embedding_vec_len
sample_id	0,0,0,0,1,1,1,2,2,3	每個樣本的每個key 對應了embedding_feature_tensors_ 中的 row index。比如CSR第一行是40,50,10,20，它們都為 embedding_feature_tensors_ 的第一行做出了貢獻。
sample_id_sort	[0,1,3,0,2,1,2,0,0,1 ]	和 hash_value_index_sort 對應，就是 hash_value_index_sort 前三個 1 分別對應了embedding_feature 的第1行，第2行，第4行（從0開始的序列）
hash_value_index_sort	[1,1,1,2,2,3,3,4,5,5]	排序之後的結果，舉例來說，111 意思是本batch之中，一共有3個key對最終embedding_feature第一行做出了貢獻

具體程式碼如下：

// step3: sort by hash_value_index
int end_bit = static_cast<int>(log2(static_cast<float>(max_vocabulary_size_per_gpu))) + 1;
size_t temp_storage_sort_size = temp_storage_sort.get_size_in_bytes();
CK_CUDA_THROW_(cub::DeviceRadixSort::SortPairs(
    temp_storage_sort.get_ptr(), temp_storage_sort_size, hash_value_index.get_ptr(),
    hash_value_index_sort.get_ptr(), sample_id.get_ptr(), sample_id_sort.get_ptr(), nnz, 0,
    end_bit, stream, false));

6.5.1 SortPairs

這裡依然用到了CUB的方法，具體可以參見：https://nvlabs.github.io/cub/structcub_1_1_device_radix_sort.html#a9e14a29dc4ba6d68dc804bc6b0da7dd4。

方法宣告如下：

template<typename KeyT , typename ValueT >
static CUB_RUNTIME_FUNCTION cudaError_t cub::DeviceRadixSort::SortPairs	(	
  void * 	d_temp_storage,
  size_t & 	temp_storage_bytes,
  const KeyT * 	d_keys_in,
  KeyT * 	d_keys_out,
  const ValueT * 	d_values_in,
  ValueT * 	d_values_out, 	
  int 	num_items,
  int 	begin_bit = 0,
  int 	end_bit = sizeof(KeyT) * 8,
  cudaStream_t 	stream = 0,
  bool 	debug_synchronous = false 
)

具體使用方法如下：

6.6 計算value_index對應的數目

現在知道了 hash_value_index_sort 是 [1,1,1,2,2,3,3,4,5,5]，sample_id_sort 是 [0,1,3,0,2,1,2,0,0,1 ]。

hash_value_index_sort 是hash_value_index排序之後的結果，舉例來說，111 意思是本batch之中，一共有3個key對最終embedding_feature第一行做出了貢獻
sample_id_sort 和 hash_value_index_sort 對應，就是 hash_value_index_sort 前三個 1 分別對應了embedding_feature 的第1行，第2行，第4行（從0開始的序列）

接下來需要知道 embedding_feature_tensors_ 每行的來源是多少個 hash_table_value 行，比如第0行有4個，第1行有3個......。embedding_feature_tensors_ 之中的一個行是被同一個slot的多個 hash_table_value 行的稠密向量做pooling完成的。

這部分對應瞭如下：

step4: count the number for each unduplicated value_index, calling value_count_kernel()

就是對 hash_value_index_sort 進行處理，這裡是 embedding 表 hash_table_value 的 row index。

// step4: count the number for each unduplicated hash_value_index
CK_CUDA_THROW_(
    cudaMemsetAsync(hash_value_index_count_counter.get_ptr(), 0, sizeof(uint32_t), stream));

constexpr size_t max_grid_size = 384;
block_size = 256;
grid_size = min(max_grid_size, (nnz - 1) / block_size + 1);

// 目的是找到新的group，就是新的 row index。目的是為了計算每個row index對應的sample id個數
value_count_kernel_1<<<grid_size, block_size, 0, stream>>>(
    nnz, hash_value_index_sort.get_ptr(), new_hash_value_flag.get_ptr());

// prefix_sum
size_t temp_storage_scan_size = temp_storage_scan.get_size_in_bytes();
CK_CUDA_THROW_(cub::DeviceScan::InclusiveSum(
    temp_storage_scan.get_ptr(), temp_storage_scan_size, new_hash_value_flag.get_ptr(),
    hash_value_flag_sumed.get_ptr(), nnz, stream));

value_count_kernel_2<<<grid_size, block_size, 0, stream>>>(
    nnz, new_hash_value_flag.get_ptr(), hash_value_flag_sumed.get_ptr(),
    hash_value_index_count_offset.get_ptr(), hash_value_index_count_counter.get_ptr());

uint32_t hash_hash_value_index_count_num = 0;
// this async memcpy will not perform as a async operation because the host memory is not
// a pinned memroy
CK_CUDA_THROW_(cudaMemcpyAsync(&hash_hash_value_index_count_num,
                               hash_value_index_count_counter.get_ptr(), sizeof(uint32_t),
                               cudaMemcpyDeviceToHost, stream));

我們接下來一點點分析。

6.6.1 value_count_kernel_1

value_count_kernel_1目的是找到新的group，就是新的 row index。目的是為了計算每個row index對應的sample id 個數。就是找到哪些點是新行起始點。我們擴充表格如下。

名稱	數值	意義
CSR row offset	0,4,7,9,10	兩個樣本，兩個slot，所以分成四行
CSR value	40,50,10,20,30,50,10,30,20,10	樣本內容
hash_value_index_tensors_	4,5,1,2,3,5,1,3,2,1	低維嵌入表的index，樣本每個key對應一個，比如50對應了 hash_table_value 第5行
sample_id	0,0,0,0,1,1,1,2,2,3	每個樣本的每個key 對應了embedding_feature_tensors_ 中的 row index。比如CSR第一行是40,50,10,20，它們都為 embedding_feature_tensors_ 的第一行做出了貢獻。
sample_id_sort	[0,1,3,0,2,1,2,0,0,1 ]	和 hash_value_index_sort 對應，就是 hash_value_index_sort 前三個 1 分別對應了 embedding_feature 的第1行，第2行，第4行（從0開始的序列）
hash_value_index_sort	[1,1,1,2,2,3,3,4,5,5]	排序之後的結果，舉例來說，1,1,1 意思是本batch之中，一共有3個key對最終embedding_feature第一行做出了貢獻
new_hash_value_flag	[1,0,0,1,0,1,0,1,1,0]	為了計算每個row index對應的sample id 個數。就是找到哪些點是新行起始點

具體程式碼如下：

__global__ void value_count_kernel_1(int nnz, const size_t *hash_value_index_sort,
                                     uint32_t *new_hash_value_flag) {
  for (int gid = blockIdx.x * blockDim.x + threadIdx.x; gid < nnz; gid += blockDim.x * gridDim.x) {
    size_t cur_value = hash_value_index_sort[gid];
    if (gid > 0) {
      size_t former_value = hash_value_index_sort[gid - 1];
      // decide if this is the start of a group(the elements in this group have the same
      // hash_value_index_sort)
      if (cur_value != former_value) {
        new_hash_value_flag[gid] = 1;
      } else {
        new_hash_value_flag[gid] = 0;
      }
    } else {  // gid == 0
      new_hash_value_flag[gid] = 1;
    }
  }
}

6.6.2 prefix_sum

對 new_hash_value_flag 排序，目的是得到每個group（row index）內部包含多少元素，放入 hash_value_flag_sumed 之中。

// prefix_sum
size_t temp_storage_scan_size = temp_storage_scan.get_size_in_bytes();
CK_CUDA_THROW_(cub::DeviceScan::InclusiveSum(
    temp_storage_scan.get_ptr(), temp_storage_scan_size, new_hash_value_flag.get_ptr(),
    hash_value_flag_sumed.get_ptr(), nnz, stream));

這裡使用了 cub::DeviceScan::InclusiveSum，如果想深入研究，可以參見 https://nvlabs.github.io/cub/structcub_1_1_device_scan.html 。

以下是函式說明。

以下是使用方法。

我們擴充表格如下。

名稱	數值	意義
CSR row offset	0,4,7,9,10	兩個樣本，兩個slot，所以分成四行
CSR value	40,50,10,20,30,50,10,30,20,10	樣本內容
hash_value_index_tensors_	[4,5,1,2,3,5,1,3,2,1]	低維嵌入表的index，樣本每個key對應一個，比如50對應了 hash_table_value 第5行
sample_id	[0,0,0,0,1,1,1,2,2,3]	每個樣本的每個key 對應了embedding_feature_tensors_ 中的 row index。比如CSR第一行是40,50,10,20，它們都為 embedding_feature_tensors_ 的第一行做出了貢獻。
sample_id_sort	[0,1,3,0,2,1,2,0,0,1]	和 hash_value_index_sort 對應，就是 hash_value_index_sort 前三個 1 分別對應了 embedding_feature 的第1行，第2行，第4行（從0開始的序列）
hash_value_index_sort	[1,1,1,2,2,3,3,4,5,5]	排序之後的結果，舉例來說，1,1,1 意思是本batch之中，一共有3個key對最終embedding_feature第一行做出了貢獻
new_hash_value_flag	[1,0,0,1,0,1,0,1,1,0]	為了計算每個row index對應的sample id 個數。就是找到哪些點是新行起始點
hash_value_flag_sumed	[1,1,1,2,2,3,3,4,5,5]	對 new_hash_value_flag 合併，目的是得到每個group（row index）內部包含多少元素。
hash_table_value	5 x 8 的矩陣	低維嵌入表，假定稠密向量長度是8，因為一共只有5個不同數字，所以只有5行

6.6.3 value_count_kernel_2

這個程式碼作用就是得到最終每行元素個數。

value_count_kernel_2<<<grid_size, block_size, 0, stream>>>(
    nnz, new_hash_value_flag.get_ptr(), hash_value_flag_sumed.get_ptr(),
    hash_value_index_count_offset.get_ptr(), hash_value_index_count_counter.get_ptr());

uint32_t hash_hash_value_index_count_num = 0;
// this async memcpy will not perform as a async operation because the host memory is not
// a pinned memroy
CK_CUDA_THROW_(cudaMemcpyAsync(&hash_hash_value_index_count_num,
                               hash_value_index_count_counter.get_ptr(), sizeof(uint32_t),
                               cudaMemcpyDeviceToHost, stream));

hash_hash_value_index_count_num 是index總數，就是一共真實有幾行，其對應了nnz。

* @param nnz non-zero feature number per batch

現在知道了 hash_value_index_sort 是 [1,1,1,2,2,3,3,4,5,5]，sample_id_sort 是 [0,1,3,0,2,1,2,0,0,1 ]，new_hash_value_flag 是 [1,0,0,1,0,1,0,1,1,0]，裡面放置了本行是不是新行。hash_value_flag_sumed 是[ 1,1,1,2,2,3,3,4,5,5 ]。

我們分析一下程式碼。總體思想是：在 hash_value_index_index（對應傳進來的引數是 hash_value_index_count_offset）設定 "按照數目計算的，對應的 embedding 表 index（就是對應的 embedding 錶行號）"。因為embedding_feature 最多隻有5行（nnz個數），所以這裡取前五個即可。

比如，每個block要處理低維稠密矩陣一行。如 bid = 1，它希望更新低維稠密矩陣第2行，但是想知道更新幾次。所以先從 hash_value_index_count_offset[1] 得到了數值 3，然後找到 hash_value_index_sort[3] 來進行處理。

具體是：遍歷grid，但是需要小於nnz（該batch的非零key數目），其實就是 hash_table_value 的行數。比如說nnz這裡等於10，gid 取值就是0～9。grid為0，3，5，7，8 時候new_hash_value_flag[gid] 為 1。hash_value_flag_sumed[gid]分別為：1,2,3,4,5。所以 hash_value_index_count_offset 是 [0, 3, 5, 7, 8, 0, 0, 0, 0, 0]，這些是 hash_value_index_sort 之中的offset。

__global__ void value_count_kernel_2(int nnz, const uint32_t *new_hash_value_flag,
                                     const uint32_t *hash_value_flag_sumed,
                                     uint32_t *hash_value_index_index, uint32_t *counter)

{
  // 遍歷grid，但是需要小於該batch的非零key數目，其實就是 hash_table_value 的行數
  for (int gid = blockIdx.x * blockDim.x + threadIdx.x; gid < nnz; gid += blockDim.x * gridDim.x) {
    uint32_t flag = new_hash_value_flag[gid];
    if (flag == 1) {
      // 設定
      hash_value_index_index[hash_value_flag_sumed[gid] - 1] = gid; 
    }
  }
  if (blockIdx.x * blockDim.x + threadIdx.x == 0) {
    *counter = hash_value_flag_sumed[nnz - 1]; 
    hash_value_index_index[*counter] = nnz; 
  }
}

到目前為止，所有變數如下：

名稱	數值	意義
CSR row offset	0,4,7,9,10	兩個樣本，兩個slot，所以分成四行
CSR value	40,50,10,20,30,50,10,30,20,10	樣本內容
hash_table_value	5 x 8 的矩陣	低維嵌入表，假定稠密向量長度是8，因為一共只有5個不同數字（nnz），所以只有5行
embedding_feature_tensors_	4 x 8 的矩陣	嵌入層輸出的稠密向量。形狀是(batch_size * slot_num) * embedding_vec_len
hash_value_index_tensors_	[4,5,1,2,3,5,1,3,2,1]	低維嵌入表的index，樣本每個key對應一個，比如50對應了 hash_table_value 第5行
sample_id	[0,0,0,0,1,1,1,2,2,3]	每個樣本的每個key 對應了embedding_feature_tensors_ 中的 row index。比如CSR第一行是40,50,10,20，它們都為 embedding_feature_tensors_ 的第一行做出了貢獻。
sample_id_sort	[0,1,3,0,2,1,2,0,0,1]	和 hash_value_index_sort 對應，就是 hash_value_index_sort 前三個 1 分別對應了 embedding_feature 的第1行，第2行，第4行（從0開始的序列）
hash_value_index_sort	[1,1,1,2,2,3,3,4,5,5]	排序之後的結果，舉例來說，1,1,1 意思是本batch之中，一共有3個key對最終embedding_feature第一行做出了貢獻
new_hash_value_flag	[1,0,0,1,0,1,0,1,1,0]	為了計算每個row index對應的sample id 個數。就是找到哪些點是新行起始點
hash_value_flag_sumed	[1,1,1,2,2,3,3,4,5,5]	對 new_hash_value_flag 合併，目的是得到每個group（row index）內部包含多少元素。
hash_value_index_count_offset	[0, 3, 5, 7, 8, 0, 0, 0, 0, 0]	每個block要處理低維稠密矩陣一行。如 bid = 1，它希望更新低維稠密矩陣第2行，但想知道更新幾次。所以先從 hash_value_index_count_offset[1] 得到了數值 3，然後找到 hash_value_index_sort[3]。因為embedding_feature 最多隻有5行（nnz個數），所以這裡取前五個即可

最終思路如下:

每個block要處理低維稠密矩陣一行。假如bid=0 想更新低維矩陣第一行，就是要更新10對應的低維矩陣稠密向量。
bid對應了key（的梯度），比如 40,50,10,20,30,50,10,30,20,10 這些，其key就是10～50這個5個。
hash_value_index_count_offset ：本bid對於低維稠密矩陣該行要更新幾次。sum_num = hash_value_index_count_offset[1] - hash_value_index_count_offset[0] = 3 - 0 = 3個，所以更新3次。
hash_value_index_sort ：在 [1,1,1,2,2,3,3,4,5,5] 這裡找到 1,1,1，表示本batch之中一共有3個key對最終embedding_feature第一行做出了貢獻。
所以 bid = 0 ，就是hash_table_value[0]這一行有三個1，應該更新3次。
sample_id_sort ：更新就是累積，就是這3次更新分別去輸入梯度哪一行去找？3個10分別在梯度的0,1,3這幾行。

6.7 更新權重

這是最後一步，對應瞭如下：

step5: use optimizer method to compute deltaw and update the parameters

呼叫程式碼如下：

注意，這裡傳遞的是 sample_id_sort [0,1,3,0,2,1,2,0,0,1]，對應的 hash_value_index_sort 是 [1,1,1,2,2,3,3,4,5,5]，hash_value_index_count_offset 是 [0, 3, 5, 7, 8, 0, 0, 0, 0, 0]。

case Optimizer_t::AdaGrad: {
  opt_adagrad_kernel<<<grid_size, block_size, 0, stream>>>(
      hash_hash_value_index_count_num, embedding_vec_size, opt_params.lr,
      opt_params.hyperparams.adagrad, opt_tensor.opt_accm_tensors_.get_ptr(),
      sample_id_sort.get_ptr(), hash_value_index_sort.get_ptr(),
      hash_value_index_count_offset.get_ptr(), wgrad.get_ptr(),
      hash_table_value.get_ptr(), opt_params.scaler);
  break;
}

很明顯可以看到，其就是使用權重更新 hash_table_value。

// Local update for the Adagrad optimizer: compute the gradients and update the accumulators and the
// weights
template <typename TypeKey, typename TypeEmbeddingComp>
__global__ void opt_adagrad_kernel(uint32_t hash_value_index_count_num, int embedding_vec_size,
                                   float lr, const AdaGradParams adagrad,
                                   TypeEmbeddingComp *accum_ptr, const TypeKey *sample_id,
                                   const size_t *hash_value_index_sort,
                                   const uint32_t *hash_value_index_count_offset,
                                   const TypeEmbeddingComp *wgrad, float *hash_table_value,
                                   float scaler) {
  int bid = blockIdx.x; // 一個block對應一個樣本之中的一個key，比如例子之中的30
  int tid = threadIdx.x; // 本執行緒

  if (tid < embedding_vec_size && bid < hash_value_index_count_num) {
    // 找到本執行緒樣本在 hash_value_index_sort 的偏移
    uint32_t offset = hash_value_index_count_offset[bid];  // [0, 3, 5, 7, 8, 0, 0, 0, 0, 0]

    // 累積得出梯度
    float gi = accumulate_gradients(embedding_vec_size, sample_id, hash_value_index_count_offset,
                                    wgrad, scaler, offset, bid, tid);

    // 找到本樣本在低維矩陣之中的row index
    size_t row_index = hash_value_index_sort[offset]; // [1,1,1,2,2,3,3,4,5,5]
    // 注意，hash_table_value 是元素級別，比如稠密向量長度是8，那麼在 hash_table_value 裡面就有8個元素
    // feature_index 就是得到本執行緒對應的 embedding vector 之中的哪個元素
    size_t feature_index = row_index * embedding_vec_size + tid;
    
    float accum = //accum_ptr 來自優化器
        TypeConvertFunc<float, TypeEmbeddingComp>::convert(accum_ptr[feature_index]) + gi * gi;

    accum_ptr[feature_index] = TypeConvertFunc<TypeEmbeddingComp, float>::convert(accum);
    float weight_diff = -lr * gi / (sqrtf(accum) + adagrad.epsilon);

    // 更新梯度
    hash_table_value[feature_index] += weight_diff;
  }
}

accumulate_gradients 的邏輯是：

// Helper function to accumulate the weight gradients for a thread's embedding vector
template <typename TypeKey, typename TypeEmbeddingComp>
__device__ __forceinline__ float accumulate_gradients(int embedding_vec_size,
                                                      const TypeKey *sample_id,
                                                      const uint32_t *hash_value_index_count_offset,
                                                      const TypeEmbeddingComp *wgrad, float scaler,
                                                      uint32_t offset, int bid, int tid) {

  // 哪一行更新幾次
  // 如果bid=0,則sum_num = hash_value_index_count_offset[1] - hash_value_index_count_offset[0] = 3 - 0 = 3個。bid對應了key，比如 40,50,10,20,30,50,10,30,20,10 這些key，其key就是10～50這個5個。所以 bid = 0 就是要更新10對應的低維矩陣稠密向量，就是hash_table_value[0]這一行，有三個1，應該更新3次。
  uint32_t sample_num = hash_value_index_count_offset[bid + 1] - hash_value_index_count_offset[bid];

  // 計算梯度
  float gi = 0.0f;
  // sample_id_sort [0,1,3,0,2,1,2,0,0,1] ---- 第幾行，恰恰和 wgrad 對上了
  for (int i = 0; i < sample_num; i++) { // offset 就是0, 3, 5, 7, 8，比如對於第1行，需要更新3次
    // sample_id 是[0,1,3,0,2,1,2,0,0,1]，對應了低維矩陣第1,2,4,...,行，就是3個10分別在輸出稠密向量的哪一行
    // 更新這幾次，就是一個累積，這個累積用哪些梯度來累積。    
    int sample_index = sample_id[offset + i]; // 找到本樣本梯度
    gi += TypeConvertFunc<float, TypeEmbeddingComp>::convert(
        wgrad[sample_index * embedding_vec_size + tid]); // 本執行緒梯度，並且累積
  }
  return gi / scaler;
}

最終具體如下圖：