[原始碼解析] PyTorch 分散式 Autograd (2) ---- RPC基礎

def my_add(t1, t2):
    """Return the sum of ``t1`` and ``t2`` as computed by ``torch.add``."""
    total = torch.add(t1, t2)
    return total

def worker0():
    """Run one forward and one distributed backward pass, driven from worker 0.

    Two random tensors are added remotely on ``"worker1"`` through
    ``rpc.rpc_sync``, the result is multiplied locally with a third random
    tensor, and the sum of that product is used as the loss for
    ``dist_autograd.backward``. The loss is printed.

    Returns:
        Whatever ``dist_autograd.get_gradients(context_id)`` returns for the
        context used by this pass (previously this value was discarded).
    """
    # Set up the autograd context. Computations that take part in the
    # distributed backward pass must be within the distributed autograd
    # context manager.
    with dist_autograd.context() as context_id:
        t1 = torch.rand((3, 3), requires_grad=True)
        t2 = torch.rand((3, 3), requires_grad=True)

        # Phase 1: the RPC operation, which builds the dependency basis.

        # Perform some computation remotely.
        t3 = rpc.rpc_sync("worker1", my_add, args=(t1, t2))

        # Perform some computation locally based on the remote result.
        t4 = torch.rand((3, 3), requires_grad=True)
        t5 = torch.mul(t3, t4)

        # Compute some loss.
        loss = t5.sum()

        # Phase 2: run the backward pass.
        dist_autograd.backward(context_id, [loss])

        # Retrieve the gradients from the context and keep them, instead of
        # calling get_gradients and throwing the result away.
        grads = dist_autograd.get_gradients(context_id)

        print(loss)

    return grads

可以用如下辦法來啟動兩個 worker，其中使用 rpc.init_rpc 來初始化 rpc。worker0 啟動之後，會利用 RPC 在 worker1 之上進行一些操作。

def run_worker(rank, world_size):
    r"""
    Bring up RPC for this process, run the rank-specific work, then shut RPC
    down.

    Rank 0 registers as ``"worker0"`` and runs ``worker0()``; rank 1 registers
    as ``"worker1"`` and only serves requests. Any other rank skips
    initialization and goes straight to ``rpc.shutdown()``.
    """
    # The TCP init_method port used for init_rpc must differ from the one used
    # for init_process_group, to avoid port conflicts.
    options = TensorPipeRpcBackendOptions()
    options.init_method = "tcp://localhost:29501"

    # Rank 0 and 1 are trainers; pick the worker name for this rank.
    is_driver = rank == 0
    if is_driver:
        worker_name = "worker0"
    elif rank == 1:
        worker_name = "worker1"
    else:
        worker_name = None

    if worker_name is not None:
        rpc.init_rpc(
            worker_name,
            rank=rank,
            world_size=world_size,
            rpc_backend_options=options,
        )
        if is_driver:
            worker0()

    # Block until all RPCs finish.
    rpc.shutdown()

0x02 RPC 基礎

2.1 初始化

我們從頭看看示例程式碼，當指令碼啟動時候，會呼叫到 rpc.init_rpc 來初始化 rpc。從 RPC 註釋中可以看到兩個概念，就是大家常見的 rank 和 world_size。

rank (int): a globally unique id/rank of this node.
world_size (int): The number of workers in the group.

具體初始化程式碼是：

def init_rpc(
    name,
    backend=None,
    rank=-1,
    world_size=None,
    rpc_backend_options=None,
):
        """Initialize distributed autograd, the profiler node id, and the RPC backend.

        Args:
            name: passed through to ``_init_rpc_backend``.
            backend: passed through to ``_init_rpc_backend``.
            rank: a globally unique id/rank of this node.
            world_size: the number of workers in the group.
            rpc_backend_options: passed through to ``_init_rpc_backend``.
        """
        # NOTE(review): this is an abridged excerpt. ``store`` is used below but
        # is not defined in the lines shown -- presumably it is created by code
        # that was omitted; confirm against the full source.
        dist_autograd._init(rank) # The distributed autograd engine is discussed later.
        _set_profiler_node_id(rank)
        # Initialize RPC.
        _init_rpc_backend(backend, store, name, rank, world_size, rpc_backend_options)

其中我們關心的是：_init_rpc_backend 會設定後端。

2.1.1 初始化後端

_init_rpc_backend 這裡會依據配置來看看最後生成什麼 Agent，然後把這個代理設定到當前上下文。RPC有兩種後端，TENSORPIPE 和 PROCESS_GROUP，其中PROCESS_GROUP已經被廢棄，會逐漸遷移到TENSORPIPE。

def _init_rpc_backend(
    backend=BackendType.TENSORPIPE,  # TENSORPIPE is the default backend
    store=None,
    name=None,
    rank=-1,
    world_size=-1,
    rpc_backend_options=None,
):
    """Validate the arguments, build the RPC agent for ``backend``, and register it.

    Raises:
        RuntimeError: if a current RPC agent is already set.
    """
    _validate_rpc_args(backend, store, name, rank, world_size, rpc_backend_options)

    already_initialized = _is_current_rpc_agent_set()
    if already_initialized:
        raise RuntimeError("RPC is already initialized")

    # Ask the backend registry to create the agent for the chosen backend.
    agent_kwargs = dict(
        store=store,
        name=name,
        rank=rank,
        world_size=world_size,
        rpc_backend_options=rpc_backend_options,
    )
    agent = backend_registry.init_backend(backend, **agent_kwargs)

    # Hand the new agent over so that it becomes part of the RPC state.
    api._init_rpc_states(agent)

可以看到，預設會生成 TensorPipeAgent。

2.1.2 生成代理

我們接下來看看如何生成 TensorPipeAgent，具體是在 torch/csrc/distributed/rpc/init.cpp。當這裡生成 TensorPipeAgent 時候，把 RequestCallbackImpl 配置為回撥函式。代理內部就用這個回撥函式來處理接收到的請求。

// Python binding whose init lambda constructs a TensorPipeAgent.
// NOTE: this is an excerpt; the `.def(` call opened below is not closed within
// the lines shown.
shared_ptr_class_<TensorPipeAgent>(module, "TensorPipeAgent", rpcAgent)
    .def(
        py::init([](const c10::intrusive_ptr<::c10d::Store>& store,
                    std::string selfName,
                    worker_id_t selfId,
                    int worldSize,
                    c10::intrusive_ptr<::c10d::ProcessGroup> processGroup,
                    TensorPipeRpcBackendOptions opts) {
          return std::shared_ptr<TensorPipeAgent>(
              new TensorPipeAgent(
                  store,
                  std::move(selfName),
                  selfId,
                  worldSize,
                  std::move(processGroup),
                  std::move(opts),
                  std::make_unique<RequestCallbackImpl>()), // A new RequestCallbackImpl is passed to the agent as its request callback.
              impl::destroy_without_gil<TensorPipeAgent>);
        })

具體如下：

+-----------------+        +-----------------------+
| TensorPipeAgent |        | RequestCallbackImpl   |
|                 |        |                       |
|         cb_ +----------> |                       |
|                 |        |                       |
+-----------------+        +-----------------------+

2.1.3 設定代理

_init_rpc_states 會把代理設定在 PyTorch 環境之中，其定義在 torch/distributed/rpc/api.py 之中。

def _init_rpc_states(agent):
    """Record the names of all known workers and install ``agent`` if none is set.

    The module-level ``_ALL_WORKER_NAMES`` is replaced by the set of names
    reported by ``agent.get_worker_infos()``.
    """
    infos = agent.get_worker_infos()
    global _ALL_WORKER_NAMES
    _ALL_WORKER_NAMES = {info.name for info in infos}

    # NB: the backend implementation might have already set the rpc_agent, in
    # which case there is nothing left to do.
    if _is_current_rpc_agent_set():
        return
    _set_and_start_rpc_agent(agent)

接下來就要進入 C++ 世界。在 torch/csrc/distributed/rpc/init.cpp 中有 _set_and_start_rpc_agent，其作用是：

RpcAgent::setCurrentRpcAgent 設定了代理。
呼叫 rpcAgent->start() 來啟動代理。

// Binds "_set_and_start_rpc_agent": stores the agent as the current one,
// attaches a TypeResolver to it, and starts it. The binding is registered with
// a py::gil_scoped_release call guard.
module.def(
    "_set_and_start_rpc_agent",
    [](const std::shared_ptr<RpcAgent>& rpcAgent) {
        
      RpcAgent::setCurrentRpcAgent(rpcAgent); // The agent is set as the current agent here.
        
      // Initializing typeResolver inside RpcAgent constructor will make
      // RpcAgent have python dependency. To avoid RpcAgent to have python
      // dependency, setTypeResolver() here.
        
      std::shared_ptr<TypeResolver> typeResolver =
          std::make_shared<TypeResolver>([&](const c10::QualifiedName& qn) {
            auto typePtr = PythonRpcHandler::getInstance().parseTypeFromStr(
                qn.qualifiedName());
            return c10::StrongTypePtr(
                PythonRpcHandler::getInstance().jitCompilationUnit(),
                std::move(typePtr));
          });
      rpcAgent->setTypeResolver(typeResolver);
      rpcAgent->start(); // Start the agent.
    },
    py::call_guard<py::gil_scoped_release>());

setCurrentRpcAgent 定義在 torch/csrc/distributed/rpc/rpc_agent.cpp 之中。

2.1.4 靜態類變數

在 RpcAgent 之中，有一個靜態成員變數 currentRpcAgent_。

// Excerpt of RpcAgent showing only its static current-agent member.
class TORCH_API RpcAgent {
     // Other member variables and functions are omitted here.
     private:
      // Declared static, so there is one instance for the whole class rather
      // than one per object.
      static std::shared_ptr<RpcAgent> currentRpcAgent_;
};

在 C++ 之中，靜態成員變數有如下特點：

其屬於整個類所有。
其生命期不依賴於任何物件，為程式的生命週期。
可以通過類名直接訪問公有靜態成員變數。
可以通過物件名訪問一個類的公有靜態成員變數。
類的所有派生物件共享該類的靜態成員變數。
靜態成員變數需要在該類外單獨分配空間。
靜態成員變數在程式內部位於全域性資料區。

所以，我們可知RpcAgent::currentRpcAgent_ 可以認為就是全域性變數，rpc 統一使用這個變數進行協調。具體通過 RpcAgent 的一些公有成員函式來完成這些功能。

// Out-of-class definition of the static member; starts out as nullptr.
std::shared_ptr<RpcAgent> RpcAgent::currentRpcAgent_ = nullptr;

// Returns true when the atomically loaded current agent is non-null.
bool RpcAgent::isCurrentRpcAgentSet() {
  return std::atomic_load(&currentRpcAgent_) != nullptr;
}

// Returns a copy of the atomically loaded current agent pointer (may be null).
std::shared_ptr<RpcAgent> RpcAgent::getCurrentRpcAgent() {
  std::shared_ptr<RpcAgent> agent = std::atomic_load(&currentRpcAgent_);
  return agent;
}

// Installs `rpcAgent` as the current agent, or clears it when `rpcAgent` is
// null.
// NOTE(review): the comments below mention an assert, but no assert appears in
// this excerpt -- presumably it was elided from the quoted code; confirm
// against the full source.
void RpcAgent::setCurrentRpcAgent(std::shared_ptr<RpcAgent> rpcAgent) {
  if (rpcAgent) {
    // Expected value for the compare-exchange: a null pointer.
    std::shared_ptr<RpcAgent> previousAgent;
    // Use compare_exchange so that we don't actually perform the exchange if
    // that would trigger the assert just below. See:
    // https://en.cppreference.com/w/cpp/atomic/atomic_compare_exchange
    std::atomic_compare_exchange_strong(
        &currentRpcAgent_, &previousAgent, std::move(rpcAgent));
  } else {
    // We can't use compare_exchange (we don't know what value to expect) but we
    // don't need to, as the only case that would trigger the assert is if we
    // replaced nullptr with nullptr, which we can just do as it has no effect.
    std::shared_ptr<RpcAgent> previousAgent =
        std::atomic_exchange(&currentRpcAgent_, std::move(rpcAgent));
  }
}

於是目前擴充如下，以後進行 RPC 操作，都會通過 RpcAgent::currentRpcAgent_ 這個全域性變數進行。

RpcAgent::currentRpcAgent_
      +
      |
      |
      |
      v
+-----+-----------+        +-----------------------+
| TensorPipeAgent |        | RequestCallbackImpl   |
|                 |        |                       |
|         cb_ +----------> |                       |
|                 |        |                       |
+-----------------+        +-----------------------+

2.2 RPC 代理

dist.autograd 的相關功能都是基於 RPC 代理完成，所以我們需要仔細看看代理。

2.2.1 RpcAgent

這是用來傳遞RPC的代理，是收發 RPC訊息的代理基類，其：

提供了send API用來處理request 和 response。
也配置了 cb_ 用來處理接收到的請求。

WorkerInfo 是代理例項所在 worker 的全域性唯一標示，包括name_和id_這兩個成員變數。name_是全域性唯一名字，id_是全域性唯一ID。

// Excerpt of the RpcAgent base class: constructor, the pure-virtual send()
// API, and a subset of its data members.
class TORCH_API RpcAgent {
 public:
  RpcAgent(
      WorkerInfo id,
      std::unique_ptr<RequestCallback> cb,
      std::chrono::milliseconds rpcTimeout);
  
  // Sends a message to the RpcAgent identified by `to` and returns a
  // JitFuture; the implementation is asynchronous.
  virtual c10::intrusive_ptr<JitFuture> send(
      const WorkerInfo& to,
      Message&& message,
      const float rpcTimeoutSeconds = kUnsetRpcTimeout,
      const std::unordered_map<c10::Device, c10::Device>& deviceMap = {}) = 0;

 protected:
  const WorkerInfo workerInfo_; // Globally unique identity of this agent instance
  const std::unique_ptr<RequestCallback> cb_; // Callback used for received requests
  std::atomic<std::chrono::milliseconds> rpcTimeout_;
  std::atomic<bool> profilingEnabled_;
  std::shared_ptr<TypeResolver> typeResolver_;
  std::atomic<bool> rpcAgentRunning_;

 private:
  static std::shared_ptr<RpcAgent> currentRpcAgent_; // The global (class-wide) agent
  // Add GIL wait time data point to metrics
  virtual void addGilWaitTime(const std::chrono::microseconds gilWaitTime) = 0;
  friend class PythonRpcHandler;
  // Condition Variable to signal when the rpcRetryMap_ has been populated.
  std::condition_variable rpcRetryMapCV_;
  // Mutex to protect RpcRetryMap_.
  std::mutex rpcRetryMutex_;
};

2.2.2 ProcessGroupAgent

ProcessGroupAgent 是 RpcAgent 的派生類。這是之前使用的，但是 PyTorch 提供了更優秀的 TensorPipeAgent。我們只選取了部分成員變數。

// Excerpt of ProcessGroupAgent, an RpcAgent subclass; only a subset of its
// member variables is shown.
class TORCH_API ProcessGroupAgent : public RpcAgent {
 public:

  c10::intrusive_ptr<::c10d::ProcessGroup> pg_;
  // worker name -> rank
  std::unordered_map<std::string, worker_id_t> nameMap_;
  std::vector<WorkerInfo> allWorkerInfo_;

  MessageCounter sendCounts_;
  MessageCounter recvCounts_;

  std::atomic<int64_t> nextId_;

  std::thread listenerThread_;
  std::thread futureTimeoutThread_;
  c10::intrusive_ptr<c10d::ProcessGroup::Work> recvWork_;

  // worker id -> set of ProcessGroup work handles
  std::unordered_map<
      worker_id_t,
      std::set<c10::intrusive_ptr<c10d::ProcessGroup::Work>>>
      currentPendingSends_;

  ThreadPool threadPool_;

  // Mapping of request id to FutureInfo struct.
  std::unordered_map<int64_t, FutureInfo> futures_;
};

2.2.3 TensorPipeAgent

TensorPipeAgent 定義在 torch/csrc/distributed/rpc/tensorpipe_agent.h，這是目前和未來使用的。TensorPipeAgent利用TensorPipe在可用傳輸或通道之中透明地移動張量和資料。它就像一個混合的RPC傳輸，提供共享記憶體（linux）和TCP（linux&mac）支援。PyTorch 正在開發其支援CUDA版本。

我們只選取了部分成員變數。

// TensorPipeAgent leverages TensorPipe (https://github.com/pytorch/tensorpipe)
// to transparently move tensors and payloads through the fastest available
// transport or channel. It acts like a hybrid RPC transport, providing shared
// memory (linux) and TCP (linux & mac) support. CUDA support is in progress.
//
// NOTE: this is an excerpt; only the constructor and a subset of the members
// are shown.
class TensorPipeAgent : public RpcAgent {
 public:
  TensorPipeAgent(
      const c10::intrusive_ptr<::c10d::Store>& store,
      std::string selfName,
      worker_id_t selfId,
      int worldSize,
      c10::intrusive_ptr<::c10d::ProcessGroup> processGroup,
      TensorPipeRpcBackendOptions opts,
      std::unique_ptr<RequestCallback> cb);

  const TensorPipeRpcBackendOptions opts_;
  std::unordered_map<std::string, DeviceMap> reverseDeviceMaps_;
  std::vector<c10::Device> devices_;

  ThreadPool threadPool_;
  std::shared_ptr<tensorpipe::Context> context_;
  std::shared_ptr<tensorpipe::Listener> listener_;

  mutable std::mutex connectedPipesMutex_;
  // worker id -> ClientPipe
  std::unordered_map<worker_id_t, ClientPipe> connectedPipes_;

  // Maps keyed on name and id for easy WorkerInfo lookup.
  std::unordered_map<worker_id_t, WorkerInfo> workerIdToInfo_;
  std::unordered_map<std::string, WorkerInfo> workerNameToInfo_;
  std::unordered_map<std::string, std::string> workerNameToURL_;

  ::c10d::PrefixStore rankToNameStore_;
  ::c10d::PrefixStore nameToAddressStore_;
  const int worldSize_;

  // The join method is required to behave like a barrier and perform collective
  // operations. For simplicity and reliability, we offload this to a process
  // group, but probably one day we might want to re-implement them using RPCs.
  const c10::intrusive_ptr<::c10d::ProcessGroup> processGroup_;

  // Atomic message-id counter, initialized to 0.
  std::atomic<uint64_t> nextMessageID_{0};

  // Thread that will poll the timeoutMap_ for timed out messages and mark them
  // with an error accordingly
  std::thread timeoutThread_;

  // Function run by the timeoutThread_ to check for timed out RPCs
  void pollTimeoutRpcs();
};

2.2.4 回撥函式

Agent 在收到訊息時候，會呼叫回撥函式。而 RequestCallbackImpl 實現了回撥邏輯。RequestCallbackImpl 是派生類，我們先來看看基類 RequestCallbackNoPython，結果找到了RequestCallback 這個介面，所以 RequestCallback 才是這個派生體系的基礎。

class TORCH_API RequestCallbackNoPython : public RequestCallback
  
class TORCH_API RequestCallbackImpl : public RequestCallbackNoPython

2.2.4.1 RequestCallback

RequestCallback 是處理 RPC 訊息的介面，是一個抽象類。

// Functor which is invoked to process an RPC message. This is an abstract class
// with some common functionality across all request handlers. Users need to
// implement this interface to perform the actual business logic.
// Abstract base of the request-callback hierarchy: a public call operator plus
// a protected pure-virtual processMessage() that subclasses must implement.
class TORCH_API RequestCallback {
 public:
  // Invoke the callback. Takes the request message and a LazyStreamContext and
  // returns a JitFuture.
  c10::intrusive_ptr<JitFuture> operator()(
      Message& request,
      std::shared_ptr<LazyStreamContext> ctx) const;

  // NOLINTNEXTLINE(modernize-use-equals-default)
  virtual ~RequestCallback() {}

 protected:
  // RpcAgent implementation should invoke ``RequestCallback`` to process
  // received requests. There is no restriction on the implementation's
  // threading model. This function takes an rvalue reference of the Message
  // object. It is expected to return the future to a response message or
  // message containing an exception. Different rpc agent implementations are
  // expected to ensure delivery of the response/exception based on their
  // implementation specific mechanisms.
  // NOTE(review): the paragraph above says "rvalue reference", but the
  // declaration below takes `Message&` -- confirm which is intended.
  virtual c10::intrusive_ptr<JitFuture> processMessage(
      Message& request,
      std::shared_ptr<LazyStreamContext> ctx) const = 0;
};

2.2.4.2 RequestCallbackNoPython

RequestCallbackNoPython 的定義在 torch/csrc/distributed/rpc/request_callback_no_python.h，其實現了一些處理機制，因為其包含太多方法，我們只能摘錄部分，如果有興趣的朋友請深入研究。

// RequestCallback implementation with no Python dependencies.
// Excerpt: only some of the methods of this class are shown.
class TORCH_API RequestCallbackNoPython : public RequestCallback {
 public:
  // Overrides RequestCallback::processMessage.
  c10::intrusive_ptr<JitFuture> processMessage(
      Message& request,
      std::shared_ptr<LazyStreamContext> ctx) const override;

 protected:

  // Non-virtual handlers.
  void processForwardAutogradReq(
      RpcCommandBase& rpc,
      const int64_t messageId,
      const c10::intrusive_ptr<JitFuture>& responseFuture,
      std::shared_ptr<LazyStreamContext> ctx) const;

  void processBackwardAutogradReq(
      RpcCommandBase& rpc,
      const int64_t messageId,
      const c10::intrusive_ptr<JitFuture>& responseFuture) const;

  void processRpc(
      RpcCommandBase& rpc,
      const MessageType& messageType,
      const int64_t messageId,
      const c10::intrusive_ptr<JitFuture>& responseFuture,
      std::shared_ptr<LazyStreamContext> ctx) const;

  // Virtual handlers: subclasses may override these.
  virtual void processRpcWithErrors(
      RpcCommandBase& rpc,
      const MessageType& messageType,
      const int64_t messageId,
      const c10::intrusive_ptr<JitFuture>& responseFuture,
      std::shared_ptr<LazyStreamContext> ctx) const;

  virtual void processRRefBackward(
      RpcCommandBase& rpc,
      const int64_t messageId,
      const c10::intrusive_ptr<JitFuture>& responseFuture) const;
};

我們會在後續分析接收邏輯的時候，看到如何呼叫到回撥函式。

0x03 傳送邏輯

我們先來看看傳送邏輯。也就是 rpc.rpc_sync 的作用：建立 root，新增 send等。

3.1 Python

我們從 python 部分開始。

# Perform some computation remotely.
t3 = rpc.rpc_sync("worker1", my_add, args=(t1, t2))

首先來到 rpc_sync，發現其呼叫了_invoke_rpc。

@_require_initialized
def rpc_sync(to, func, args=None, kwargs=None, timeout=UNSET_RPC_TIMEOUT):
    """Issue ``func`` to ``to`` through ``_invoke_rpc`` in SYNC mode.

    Returns the result of calling ``wait()`` on the future that
    ``_invoke_rpc`` hands back.
    """
    return _invoke_rpc(to, func, RPCExecMode.SYNC, args, kwargs, timeout).wait()

其次來到_invoke_rpc，可以看到此函式依據呼叫型別不同（內建操作，script，udf這三種），選擇了不同路徑。

def _invoke_rpc(to, func, rpc_type, args=None, kwargs=None, rpc_timeout=UNSET_RPC_TIMEOUT):
    """Dispatch an RPC for ``func`` to the worker ``to`` and return its future.

    One of three paths is taken depending on the kind of callable:

    * a builtin operator (``_find_builtin`` returned a qualified name),
    * a ``torch.jit.ScriptFunction``,
    * anything else, which is pickled and sent as a Python UDF.

    When profiling is enabled, the future is passed through the profiler's
    end-callbacks hook before being returned.
    """
    qualified_name = torch.jit._builtins._find_builtin(func)
    dst_worker_info = _to_worker_info(to)
    should_profile = torch.autograd._profiler_enabled()

    with _enable_rpc_profiler(
        should_profile, qualified_name, func, rpc_type, dst_worker_info
    ) as rf:
        # Normalize missing/empty arguments.
        args = args or ()
        kwargs = kwargs or {}

        is_async_exec = hasattr(func, "_wrapped_async_rpc_function")
        if is_async_exec:
            # Unwrap only when the wrapped callable is a script function.
            inner = func._wrapped_async_rpc_function
            if isinstance(inner, torch.jit.ScriptFunction):
                func = inner

        if qualified_name is not None:
            # Builtin operator.
            fut = _invoke_rpc_builtin(
                dst_worker_info, qualified_name, rpc_timeout, *args, **kwargs
            )
        elif isinstance(func, torch.jit.ScriptFunction):
            # TorchScript function.
            fut = _invoke_rpc_torchscript(
                dst_worker_info.name,
                torch._jit_internal._qualified_name(func),
                args,
                kwargs,
                rpc_timeout,
                is_async_exec,
            )
        else:
            # User-defined Python function: pickle it together with its arguments.
            udf = PythonUDF(func, args, kwargs)
            pickled_python_udf, tensors = _default_pickler.serialize(udf)
            fut = _invoke_rpc_python_udf(
                dst_worker_info,
                pickled_python_udf,
                tensors,
                rpc_timeout,
                is_async_exec,
            )

        if should_profile:
            fut = rf._call_end_callbacks_on_future(fut)
    return fut

從這裡開始就進入到了C++世界，torch/csrc/distributed/rpc/init.cpp。

3.2 C++

這裡可以看到 _invoke_rpc_builtin 對應了 pyRpcBuiltin，_invoke_rpc_python_udf 對應了 pyRpcPythonUdf。

// Excerpt of rpc_init showing two of the Python bindings it registers:
// "_invoke_rpc_builtin" forwards to pyRpcBuiltin and "_invoke_rpc_python_udf"
// forwards to pyRpcPythonUdf. Each result is wrapped in a
// jit::PythonFutureWrapper.
PyObject* rpc_init(PyObject* _unused, PyObject* noargs) {
  module.def(
      "_invoke_rpc_builtin",
      [](const WorkerInfo& dst,
         const std::string& opName,
         const float rpcTimeoutSeconds,
         const py::args& args,
         const py::kwargs& kwargs) {
        return std::make_shared<jit::PythonFutureWrapper>(
            pyRpcBuiltin(dst, opName, args, kwargs, rpcTimeoutSeconds)); // builtin operator path
      },
      py::call_guard<py::gil_scoped_acquire>());

  module.def(
      "_invoke_rpc_python_udf",
      [](const WorkerInfo& dst,
         std::string& pickledPythonUDF,
         std::vector<torch::Tensor>& tensors,
         const float rpcTimeoutSeconds,
         const bool isAsyncExecution) {
        return std::make_shared<jit::PythonFutureWrapper>(pyRpcPythonUdf(
            dst,
            pickledPythonUDF, // the pickled user-defined function (UDF)
            tensors,
            rpcTimeoutSeconds,
            isAsyncExecution));
      },
      py::call_guard<py::gil_scoped_release>());

  // The remaining bindings are omitted from this excerpt.
}

我們選用 _invoke_rpc_builtin 對應的 pyRpcBuiltin 來看看。

3.2.1 pyRpcBuiltin

在 torch/csrc/distributed/rpc/python_functions.cpp可以看到，pyRpcBuiltin 會呼叫到 sendMessageWithAutograd。

// Matches the builtin operator named `opName` against the Python arguments,
// wraps it in a ScriptCall, and sends it to `dst` through the current RPC agent
// via sendMessageWithAutograd. Checks (DCHECK) that the GIL is held on entry
// and releases it once the arguments have been processed.
c10::intrusive_ptr<JitFuture> pyRpcBuiltin(
    const WorkerInfo& dst,
    const std::string& opName,
    const py::args& args,
    const py::kwargs& kwargs,
    const float rpcTimeoutSeconds) {
  DCHECK(PyGILState_Check());
  Stack stack;
  auto op = matchBuiltinOp(opName, args, kwargs, stack);
  // Release GIL since args and kwargs processing is done.
  py::gil_scoped_release release;
  auto scriptCall = std::make_unique<ScriptCall>(op, std::move(stack));
  auto agent = RpcAgent::getCurrentRpcAgent(); // Fetch the current agent.
  return toPyJitFuture(sendMessageWithAutograd( // Send the request.
      *agent,
      dst,
      std::move(*scriptCall).toMessage(),
      false,
      rpcTimeoutSeconds));
}

3.2.2 sendMessageWithAutograd

在 torch/csrc/distributed/autograd/utils.cpp 這裡利用 agent 來進行傳送 FORWARD_AUTOGRAD_REQ。

後面在接收方，我們將會看到處理 FORWARD_AUTOGRAD_REQ 訊息，因此傳送和接收大致可以聯絡起來。

// Wraps `wrappedRpcMsg` via getMessageWithAutograd using the
// FORWARD_AUTOGRAD_REQ message type and sends it to `dst` through `agent`.
// When the profiler is enabled and `forceDisableProfiling` is false, the
// message is additionally wrapped as RUN_WITH_PROFILING_REQ before sending.
// Returns the future produced by agent.send().
c10::intrusive_ptr<JitFuture> sendMessageWithAutograd(
    RpcAgent& agent,
    const WorkerInfo& dst,
    torch::distributed::rpc::Message&& wrappedRpcMsg,
    bool forceGradRecording,
    const float rpcTimeoutSeconds,
    bool forceDisableProfiling) {
  auto msg = getMessageWithAutograd( // Interacts with the autograd context and builds the FORWARD_AUTOGRAD_REQ message.
      dst.id_,
      std::move(wrappedRpcMsg),
      MessageType::FORWARD_AUTOGRAD_REQ,
      forceGradRecording,
      agent.getDeviceMap(dst));

  c10::intrusive_ptr<JitFuture> fut;
  // If profiler is enabled, wrap this message with profiling metadata that will
  // tell the remote end to process this request with the profiler enabled.
  if (!forceDisableProfiling && torch::autograd::profiler::profilerEnabled()) {
    auto profilerConfig = torch::autograd::profiler::getProfilerConfig();
    auto msgWithProfiling = getMessageWithProfiling(
        std::move(msg),
        rpc::MessageType::RUN_WITH_PROFILING_REQ, // Build the profiling message.
        std::move(profilerConfig));
    // Send the message.
    fut = agent.send(dst, std::move(msgWithProfiling), rpcTimeoutSeconds);
  } else {
    fut = agent.send(dst, std::move(msg), rpcTimeoutSeconds);
  }

  return fut;
}

傳送流程如下，其中 pyRpcBuiltin 會使用 RpcAgent::getCurrentRpcAgent() 得到 RpcAgent::currentRpcAgent_，就是得到了全域性設定的代理，然後把這個代理傳給 sendMessageWithAutograd，通過代理進行傳送。

  rpc.rpc_sync
         +
         |
         |
         v
  _invoke_rpc_builtin
         +
         |                                               Python
+---------------------------------------------------------------+
         |                                               C++
         |
         v

    pyRpcBuiltin
         +
         |
         |
         v

 sendMessageWithAutograd(RpcAgent::getCurrentRpcAgent())
         +
         |
         |
         |   RpcAgent::currentRpcAgent_
         |           +
         |           |
         |           |
         |           v
         |     +-----+-----------+
         |     | TensorPipeAgent |        +-----------------------+
         |     |                 |        | RequestCallbackImpl   |
         |     |       cb_ +------------> |                       |
         |     |                 |        +-----------------------+
         |     |                 |
         |     |                 |
         +-----------> send +-----------> Will send message to other worker
               |                 |
               |                 |
               +-----------------+

0x04 接收邏輯

4.1 回撥

當 Agent 接收到訊息之後，會呼叫到 RequestCallback::operator()，也就是我們前面所說的回撥函式。程式碼位於 torch/csrc/distributed/rpc/tensorpipe_agent.cpp。

// Reads one request from `pipe`, re-arms itself for the next read, and hands
// the request to the thread pool, where the request callback (cb_) is invoked.
// The response is sent immediately if the returned future is already
// completed; otherwise it is sent from a callback added to that future.
void TensorPipeAgent::respond(std::shared_ptr<tensorpipe::Pipe>& pipe) {
  pipeRead(
      pipe,
      [this, pipe](
          const tensorpipe::Error& error,
          Message&& requestMessage,
          std::shared_ptr<LazyStreamContext> ctx) mutable {

        // Arm for next read
        respond(pipe);

        uint64_t messageId = requestMessage.id();
        increaseCallCount(serverActiveCalls_);

        // Defer user RPC UDF run to thread pool
        threadPool_.run([this,
                         pipe,
                         messageId,
                         requestMessage{std::move(requestMessage)},
                         ctx{std::move(ctx)}]() mutable {

          c10::intrusive_ptr<JitFuture> futureResponseMessage;
          try {
              
            // Invoke the RequestCallback here to run the callback logic.
              
            futureResponseMessage = cb_->operator()(requestMessage, ctx);
            
          } catch (const std::exception& /* unused */) {
            // Turn the exception into an errored future.
            futureResponseMessage =
                c10::make_intrusive<JitFuture>(at::AnyClassType::get());
            futureResponseMessage->setError(std::current_exception());
          }

          // Shortcut if immediately done
          if (futureResponseMessage->completed()) {
            decreaseCallCount(serverActiveCalls_);
            sendCompletedResponseMessage(
                pipe, *futureResponseMessage, messageId, std::move(ctx));
          } else {
            // Not complete yet
            increaseCallCount(serverActiveAsyncCalls_);
            futureResponseMessage->addCallback(
                [this, pipe, messageId, ctx{std::move(ctx)}](
                    JitFuture& futureResponseMessage) mutable {
                  decreaseCallCount(serverActiveCalls_);
                  decreaseCallCount(serverActiveAsyncCalls_);
                  sendCompletedResponseMessage(
                      pipe, futureResponseMessage, messageId, std::move(ctx));
                });
          }
        });
      });
}

4.2 operator()

operator() 之中會呼叫 processMessage 處理訊息。

// Forwards the request and stream context to processMessage() and returns its
// future.
c10::intrusive_ptr<JitFuture> RequestCallback::operator()(
    Message& request,
    std::shared_ptr<LazyStreamContext> ctx) const {
  // NB: cannot clear autograd context id here because the processMessage method
  // might pause waiting for all RRefs in the arguments to be confirmed by their
  // owners and resume processing in a different thread. Hence, the
  // thread_local context id needs to be set and cleared in the thread that
  // indeed carries out the processing logic.
  return processMessage(request, std::move(ctx));
}

隨後，會呼叫到 RequestCallbackNoPython::processMessage 之中。

先呼叫 RequestCallbackImpl 中實現的 deserializePythonRpcCommand 來對 PythonUDF 反序列化。
然後呼叫 processRpcWithErrors 來處理訊息。

// Deserializes the request, waits for the thread-local pending RRefs, and then
// runs processRpcWithErrors() from a callback on that wait. Returns
// `retFuture`, which is passed to processRpcWithErrors() as the response
// future; if deserialization/setup throws, `retFuture` is completed with the
// handleError() message instead.
c10::intrusive_ptr<JitFuture> RequestCallbackNoPython::processMessage(
    Message& request,
    std::shared_ptr<LazyStreamContext> ctx) const {
  // We need two futures here because it could pause twice when processing a
  // RPC message:
  //  1) waiting for all RRefs in the arguments to become confirmed;
  //  2) waiting for processRpc to finish.
  auto retFuture = c10::make_intrusive<JitFuture>(at::AnyClassType::get());
  auto& rrefContext = RRefContext::getInstance();
  try {
    rrefContext.recordThreadLocalPendingRRefs();
    // Deserialize PythonUDF here to trigger RRef unpickling
    // deserializePythonRpcCommand is called to deserialize the PythonUDF
    // (presumably dispatching to the RequestCallbackImpl override -- confirm).
    std::unique_ptr<RpcCommandBase> rpc = deserializePythonRpcCommand(
        deserializeRequest(request), request.type()); // Parse the request.
    auto rrefsReadyFuture = rrefContext.waitForThreadLocalPendingRRefs();

    rrefsReadyFuture->addCallback(
        [this,
         retFuture,
         // std::function must be copyable, hence have to cast the unique_ptr to
         // a shared_ptr here.
         rpc = (std::shared_ptr<RpcCommandBase>)std::move(rpc),
         messageType = request.type(),
         id = request.id(),
         ctx = std::move(ctx)](JitFuture& /* unused */) mutable {
          c10::MultiStreamGuard guard(
              ctx ? ctx->getReservedStreams() : ArrayRef<Stream>({}));
          // The cost of pre-request check is minimal thanks to
          // std::shared_lock. The cost is in magnitude
          // of 10us.
          auto serverProcessGlobalProfilerStateStackEntryPtr =
              profiler::processglobal::StateStackEntry::current();
          // If server global profiler is enabled, we further pay the
          // cost of thread local profiler state initialization.
          if (serverProcessGlobalProfilerStateStackEntryPtr) {
            // Initialize thread-local profiler state from process-global
            // profiler state.
            ::torch::autograd::profiler::enableProfilerLegacy(
                serverProcessGlobalProfilerStateStackEntryPtr->statePtr()
                    ->config());
          }

          // The actual request processing happens here.
          processRpcWithErrors(
              *rpc, messageType, id, retFuture, std::move(ctx));

          // Response message has been sent at this moment, this post-response
          // work doesn't affect RPC trip time.
          if (serverProcessGlobalProfilerStateStackEntryPtr) {
            // Restore thread-local profiler state.
            ::torch::autograd::profiler::thread_event_lists event_lists =
                ::torch::autograd::profiler::disableProfilerLegacy();
            // Put thread_local event_lists into the process-global profiler
            // state.
            profiler::processglobal::pushResultRecursive(
                serverProcessGlobalProfilerStateStackEntryPtr, event_lists);
          }
        });
  } catch (std::exception& e) {
    retFuture->markCompleted(handleError(e, request.type(), request.id()));
    rrefContext.clearRecordedPendingRRefsOnError();
  }
  return retFuture;
}

然後呼叫到 processRpcWithErrors。

// Runs processRpc(); if it throws a std::exception, the exception is caught and
// `responseFuture` is completed with the message built by handleError().
void RequestCallbackNoPython::processRpcWithErrors(
    RpcCommandBase& rpc,
    const MessageType& messageType,
    const int64_t messageId,
    const c10::intrusive_ptr<JitFuture>& responseFuture,
    std::shared_ptr<LazyStreamContext> ctx) const {
  try {
    processRpc(rpc, messageType, messageId, responseFuture, std::move(ctx));
  } catch (std::exception& e) {
    responseFuture->markCompleted(handleError(e, messageType, messageId));
  }
}

接下來是 processRpc。這裡能夠看到處理 FORWARD_AUTOGRAD_REQ。

// Dispatches on the message type. NOTE: this is an excerpt -- the enclosing
// switch statement and the other cases are not shown in the lines below.
void RequestCallbackNoPython::processRpc(
    RpcCommandBase& rpc,
    const MessageType& messageType,
    const int64_t messageId,
    const c10::intrusive_ptr<JitFuture>& responseFuture,
    std::shared_ptr<LazyStreamContext> ctx) const {

    case MessageType::FORWARD_AUTOGRAD_REQ: { // Matches the message type used on the sending side.
      processForwardAutogradReq(rpc, messageId, responseFuture, std::move(ctx));
      return;
    }
    case MessageType::BACKWARD_AUTOGRAD_REQ: {
      processBackwardAutogradReq(rpc, messageId, responseFuture);
      return;
    };  
  
}

具體如下：

 TensorPipeAgent      RequestCallback  RequestCallbackNoPython     RequestCallbackImpl
        +                   +                 +                          +
        |                   |                 |                          |
        |                   |                 |                          |
        v                   |                 |                          |
    respond                 |                 |                          |
        +                   |                 |                          |
        |                   |                 |                          |
        |                   |                 |                          |
        v                   v                 v                          |
cb_->operator()  +-->   operator()  +-->  processMessage                 |
                                              +                          |
                                              |                          |
                                              |                          v
                                              +--------------->  deserializePythonRpcCommand
                                              |
                                              |
                                              |
                                              v

                                      processRpcWithErrors
                                              +
                                              |
                                              |
                                              v
                                          processRpc
                                              +
                                              |
                                              |
                                              v
                                    processForwardAutogradReq

4.3 RequestCallbackImpl

這時候，讀者會有疑問，之前 TensorPipeAgent 明明設定了 RequestCallbackImpl 作為回撥函式，怎麼只呼叫了其 deserializePythonRpcCommand呢，deserialXXX 看起來是序列化相關的，按說應該呼叫一些業務處理函式，比如processXXXX 之類的。我們接下來就看看 RequestCallbackImpl。

RequestCallbackImpl 定義在 torch/csrc/distributed/rpc/request_callback_impl.h。

// Request callback that publicly derives from RequestCallbackNoPython and
// overrides the hooks declared below. Every member here is a const override;
// this declaration adds no data members of its own.
// NOTE(review): the class name and the "Python" hooks suggest this is the
// Python-aware variant of the callback -- confirm against the definitions in
// request_callback_impl.cpp, which are not part of this declaration.
class TORCH_API RequestCallbackImpl : public RequestCallbackNoPython {
 public:
  // Takes ownership of `rpc` and returns an owned RpcCommandBase for the given
  // message type. NOTE(review): presumably converts a Python RPC command into
  // its deserialized form -- verify in the implementation.
  std::unique_ptr<RpcCommandBase> deserializePythonRpcCommand(
      std::unique_ptr<RpcCommandBase> rpc,
      const MessageType& messageType) const override;

  // Handler hook for a Python call request; `markComplete` receives the
  // response Message. NOTE(review): exact completion semantics not visible here.
  void processPythonCall(
      RpcCommandBase& rpc,
      const std::function<void(Message)>& markComplete,
      const int64_t messageId,
      const c10::intrusive_ptr<JitFuture>& responseFuture) const override;

  // Handler hook for a script call request; same parameter shape as
  // processPythonCall.
  void processScriptCall(
      RpcCommandBase& rpc,
      const std::function<void(Message)>& markComplete,
      const int64_t messageId,
      const c10::intrusive_ptr<JitFuture>& responseFuture) const override;

  // Handler hook for a script remote call; works on an IValue stack and an
  // OwnerRRef, with a `postProcessing` callback supplied by the caller.
  void processScriptRemoteCall(
      ScriptRemoteCall& scriptRemoteCall,
      const std::function<void(void)>& postProcessing,
      std::vector<at::IValue>& stack,
      const c10::intrusive_ptr<OwnerRRef>& ownerRRef) const override;

  // Handler hook for a Python remote call; additionally receives a shared
  // LazyStreamContext.
  void processPythonRemoteCall(
      RpcCommandBase& rpc,
      const std::function<void(Message)>& markComplete,
      const int64_t messageId,
      const c10::intrusive_ptr<JitFuture>& responseFuture,
      std::shared_ptr<LazyStreamContext> ctx) const override;

  // Runs processRpc inside a try/catch and, on py::error_already_set or
  // std::exception, completes `responseFuture` with the result of handleError.
  void processRpcWithErrors(
      RpcCommandBase& rpc,
      const MessageType& messageType,
      const int64_t messageId,
      const c10::intrusive_ptr<JitFuture>& responseFuture,
      std::shared_ptr<LazyStreamContext> ctx) const override;

  // Handler hook for an RRef backward request.
  // NOTE(review): behavior not visible in this declaration -- see the .cpp.
  void processRRefBackward(
      RpcCommandBase& rpc,
      const int64_t messageId,
      const c10::intrusive_ptr<JitFuture>& responseFuture) const override;
};

因為最終生成的是 RequestCallbackImpl，所以上圖中間的 processRpcWithErrors 這一步，實際呼叫的是 RequestCallbackImpl 的 processRpcWithErrors，它只是在呼叫 processRpc 的基礎上增加了一些異常處理邏輯。

// Forwards the request to processRpc and turns the two exception types caught
// below into a response: instead of letting them escape, responseFuture is
// completed with whatever handleError(e, messageType, messageId) returns.
//
// rpc             - the command to process, passed through to processRpc.
// messageType     - type of the incoming message; also given to handleError.
// messageId       - id of the incoming message; also given to handleError.
// responseFuture  - future that receives the error response on failure.
// ctx             - stream context, taken by value and moved into processRpc.
void RequestCallbackImpl::processRpcWithErrors(
    RpcCommandBase& rpc,
    const MessageType& messageType,
    const int64_t messageId,
    const c10::intrusive_ptr<JitFuture>& responseFuture,
    std::shared_ptr<LazyStreamContext> ctx) const {
  try {
    processRpc(rpc, messageType, messageId, responseFuture, std::move(ctx));
  } catch (py::error_already_set& e) {
    // A Python exception surfaced through pybind11. Record it in the response
    // first; the Python error state is cleaned up afterwards.
    responseFuture->markCompleted(handleError(e, messageType, messageId));
    // The GIL is acquired here, before the Python error state is touched by
    // restore() / PyErr_Clear() below.
    py::gil_scoped_acquire acquire;
    e.restore(); // Release ownership on py::objects and also restore
                 // Python Error Indicator.
    PyErr_Clear(); // Clear the Python Error Indicator as we have
                   // recorded the exception in the response message.
  } catch (std::exception& e) {
    // Any other std::exception is reported through the same error response.
    responseFuture->markCompleted(handleError(e, messageType, messageId));
  }
}

邏輯圖修改如下：

 TensorPipeAgent      RequestCallback  RequestCallbackNoPython     RequestCallbackImpl
        +                   +                 +                          +
        |                   |                 |                          |
        |                   |                 |                          |
        v                   |                 |                          |
    respond                 |                 |                          |
        +                   |                 |                          |
        |                   |                 |                          |
        |                   |                 |                          |
        v                   v                 v                          |
cb_->operator()  +-->   operator()  +-->  processMessage                 |
                                              +                          |
                                              |                          |
                                              |                          v
                                              +----------------> deserializePythonRpcCommand
                                              |                          +
                                              |                          |
                                              |                          |
                                              |                          v
                                              |
                                              +----------------> processRpcWithErrors
                                              |                          +
                                              |                          |
                                              |                          |
                                              | <------------------------+
                                              |
                                              |
                                              v
                                          processRpc
                                              +
                                              |
                                              |
                                              v
                                    processForwardAutogradReq

如果結合之前的傳送，我們擴充圖例如下：

1. 當傳送者需要在遠端執行自動梯度計算時候，呼叫 rpc.rpc_sync。
2. 從 Python 呼叫到 C++ 世界，函式為 pyRpcBuiltin。
3. 呼叫 sendMessageWithAutograd，以此通知 Receiver。
4. 會呼叫 RpcAgent::getCurrentRpcAgent() 來得到本地的 Agent。
5. 呼叫 current Agent 的 send 函式。
6. send 函式傳送 FORWARD_AUTOGRAD_REQ 給 Receiver worker。
7. respond 函式會呼叫 Receiver 之中 Agent 的回撥函式 cb_。
8. 呼叫到 RequestCallbackImpl 的 processRpcWithErrors。
9. 然後呼叫 processRpc。
10. 最後呼叫到 processForwardAutogradReq，完成了基於 RPC 的分散式 autograd 的啟動過程。

                                                             +
 rpc.rpc_sync                                 Sender         |     Receiver
        +                                                    |
        |                                                    |
        | 1                                                  |
        v                                                    |
 _invoke_rpc_builtin                                         |
        +                                                    |
        |                                      Python        |
+----------------------------------------------------------+ |
        |                                      C++           |      +----------------------------+
        |  2                                                 |      | RequestCallbackImpl        |
        v                                                    |      |                            |
                                                             |   +----> processRpcWithErrors     |
   pyRpcBuiltin                                              |   |  |             +              |
        +                                                    |   |  |             | 9            |
        |  3                                                 |   |  |             |              |
        |                                                    |   |  |             v              |
        v                                                    |   |  |         processRpc         |
                                     4                       |   |  |             +              |
sendMessageWithAutograd(RpcAgent::getCurrentRpcAgent())      |   |  |             | 10           |
        +                                                    |   |  |             |              |
        |                                                    |   |  |             v              |
        |                                                    |   |  |  processForwardAutogradReq |
        |   RpcAgent::currentRpcAgent_                       |   |  |                            |
        |           +                                        |   |  +----------------------------+
        |           |                                        |   |
        | 5         |                                        |   |8     +-----------------+
        |           v                                        |   |      | TensorPipeAgent |
        |    +------+--------+                               |   |      |                 |
        |    |TensorPipeAgent|   +-------------------+       |   +------------+ cb_       |
        |    |               |   |RequestCallbackImpl|       |          |        ^        |
        |    |      cb_ +------->+                   |       |          |      7 |        |
        |    |               |   +-------------------+       |          |        |        |
        |    |               |                          6    |          |        +        |
        +--------> send   +----------------------------------+--------------> respond     |
             |               |                   FORWARD_AUTOGRAD_REQ   |                 |
             |               |                               +          |                 |
             +---------------+                               |          +-----------------+
                                                             +

手機如下：

至此，RPC介紹完畢，我們下一篇介紹上下文相關等管理類，敬請期待。

[原始碼解析] PyTorch 分散式 Autograd (2) ---- RPC基礎

[原始碼解析] PyTorch 分散式 Autograd (2) ---- RPC基礎

0x00 摘要

0x01 示例

0x02 RPC 基礎

2.1 初始化

2.1.1 初始化後端

2.1.2 生成代理

2.1.3 設定代理

2.1.4 靜態類變數

2.2 RPC 代理

2.2.1 RpcAgent

2.2.2 ProcessGroupAgent

2.2.3 TensorPipeAgent

2.2.4 回撥函式

2.2.4.1 RequestCallback

2.2.4.2 RequestCallbackNoPython

0x03 傳送邏輯

3.1 Python

3.2 C++

3.2.1 pyRpcBuiltin

3.2.2 sendMessageWithAutograd

0x04 接受邏輯

4.1 回撥

4.2 operator()

4.3 RequestCallbackImpl

0xFF 參考

相關文章