【Remotery】 Remotery-輕量級的遠端實時 CPU/GPU 分析器設計淺析

Emma1111發表於2024-07-22

1.Remotery簡介

Remotery是一個輕量級的遠端實時CPU/GPU分析器,主要用於監控CPU和GPU上多執行緒的活動。它提供了一個C檔案,可以很容易的整合到專案中,並配置一個實時監控的Web介面,可以透過遠端觀察和分析程式的效能,適用於監控遊戲的實時執行效能和分析移動端應用的效能場景。

image

2.Remotery編譯執行

Remotery可以定義一些額外的宏來修改要編譯到Remotery中的功能:

Macro               Default     Description

    RMT_ENABLED         1           Disable this to not include any bits of Remotery in your build
    RMT_USE_TINYCRT     0           Used by the Celtoys TinyCRT library (not released yet)
    RMT_USE_CUDA        0           Assuming CUDA headers/libs are setup, allow CUDA profiling
    RMT_USE_D3D11       0           Assuming Direct3D 11 headers/libs are setup, allow D3D11 GPU profiling
    RMT_USE_OPENGL      0           Allow OpenGL GPU profiling (dynamically links OpenGL libraries on available platforms)
    RMT_USE_METAL       0           Allow Metal profiling of command buffers

2.1 Linux平臺下

Linux(GCC):將lib資料夾中的原始碼加入專案。編譯時需要使用-pthread選項來連結執行緒庫(另需-lm連結數學庫)。例如,編譯Remotery庫本身及示例程式:

cc lib/Remotery.c sample/sample.c -I lib -pthread -lm

編譯後,會在當前目錄下生成一個a.out可執行檔案。執行./a.out後,用瀏覽器開啟(雙擊)vis/index.html,即可看到各執行緒的執行情況。
image

2.2 Windows平臺下

Windows(MSVC)-將lib/Remotery.c和lib/Remotery.h新增到程式中。設定include目錄以新增Remotery/lib路徑。所需的庫ws2_32.lib應該透過在Remotery.c中使用#pragma comment(lib, "ws2_32.lib")指令來獲取。

3.Remotery各功能類簡介

3.1 基礎功能類

3.1.1 New

帶有錯誤值的新建/刪除運算子,用於簡化物件建立/銷燬

// Destroys an object created with one of the New_* macros: if the pointer is
// non-NULL, calls type##_Destructor on it, frees the memory with rmtFree and
// sets the pointer to NULL.
// The body is wrapped in do { } while (0) so that `Delete(type, obj);` expands
// to exactly one statement and stays correct inside an unbraced if/else.
// `obj` must be an assignable lvalue; it is evaluated more than once, so do
// not pass an expression with side effects.
#define Delete(type, obj)               \
    do                                  \
    {                                   \
        if ((obj) != NULL)              \
        {                               \
            type##_Destructor(obj);     \
            rmtFree(obj);               \
            (obj) = NULL;               \
        }                               \
    } while (0)

// First half of a New_* expansion: allocates sizeof(type) bytes with rmtMalloc and
// stores the result in obj. If the allocation fails, `error` is set to
// RMT_ERROR_MALLOC_FAIL; otherwise an else-branch is opened in which the caller's
// constructor call runs.
// The braces opened here are intentionally left unbalanced and are closed by EndNew,
// so the two macros must always be used as a pair. A variable named `error` must be
// in scope at the point of use.
#define BeginNew(type, obj)                 \
    {                                       \
        obj = (type*)rmtMalloc(sizeof(type));  \
        if (obj == NULL)                    \
        {                                   \
            error = RMT_ERROR_MALLOC_FAIL;  \
        }                                   \
        else                                \
        {                                   \


// Second half of a New_* expansion: if `error` is anything other than RMT_ERROR_NONE
// after the constructor call, the object is torn down with Delete. Then closes the
// else-branch and the outer scope opened by BeginNew.
#define EndNew(type, obj)                   \
            if (error != RMT_ERROR_NONE)    \
                Delete(type, obj);          \
        }                                   \
    }


// Specialisations for New with varying constructor parameter counts
// Each one allocates obj, assigns the result of type##_Constructor(obj, ...) to `error`
// and relies on EndNew to destroy the object again when construction reported an error.
#define New_0(type, obj)    \
    BeginNew(type, obj); error = type##_Constructor(obj); EndNew(type, obj)
#define New_1(type, obj, a0)    \
    BeginNew(type, obj); error = type##_Constructor(obj, a0); EndNew(type, obj)
#define New_2(type, obj, a0, a1)    \
    BeginNew(type, obj); error = type##_Constructor(obj, a0, a1); EndNew(type, obj)
#define New_3(type, obj, a0, a1, a2)    \
    BeginNew(type, obj); error = type##_Constructor(obj, a0, a1, a2); EndNew(type, obj)

3.1.2 Deps

外部依賴項

rmtU8 minU8(rmtU8 a, rmtU8 b);
rmtU16 maxU16(rmtU16 a, rmtU16 b);
rmtS64 maxS64(rmtS64 a, rmtS64 b);
void* rmtMalloc( rmtU32 size );
void* rmtRealloc( void* ptr, rmtU32 size);
void rmtFree( void* ptr );

3.1.3 ObjAlloc

可重用物件分配器

//
// All objects that require free-list-backed allocation need to inherit from this type.
// NOTE(review): in C "inherit" presumably means embedding an ObjectLink as the first
// member, the way Sample does with its Link field — confirm against the allocator code.
//
typedef struct ObjectLink_s
{
    // Next link in the list (the pointer itself is volatile-qualified)
    struct ObjectLink_s* volatile next;
} ObjectLink;

// Reusable object allocator: bookkeeping for a free list of objects that are
// constructed and destroyed through user-supplied callbacks.
typedef struct
{
    // Object create/destroy parameters
    rmtU32 object_size;
    ObjConstructor constructor;
    ObjDestructor destructor;

    // Number of objects in the free list
    volatile rmtS32 nb_free;

    // Number of objects used by callers
    volatile rmtS32 nb_inuse;

    // Total allocation count
    volatile rmtS32 nb_allocated;

    // First entry of the free list
    ObjectLink* first_free;
} ObjectAllocator;

void ObjectLink_Constructor(ObjectLink* link);
rmtError ObjectAllocator_Constructor(ObjectAllocator* allocator, rmtU32 object_size, ObjConstructor constructor, ObjDestructor destructor);
void ObjectAllocator_Destructor(ObjectAllocator* allocator);
void ObjectAllocator_Push(ObjectAllocator* allocator, ObjectLink* start, ObjectLink* end);
ObjectLink* ObjectAllocator_Pop(ObjectAllocator* allocator);
rmtError ObjectAllocator_Alloc(ObjectAllocator* allocator, void** object);
void ObjectAllocator_Free(ObjectAllocator* allocator, void* object);
void ObjectAllocator_FreeRange(ObjectAllocator* allocator, void* start, void* end, rmtU32 count);

3.1.4 Safec

安全C庫摘錄

r_size_t strnlen_s (const char *dest, r_size_t dmax);
errno_t strstr_s (char *dest, r_size_t dmax,
          const char *src, r_size_t slen, char **substring);
errno_t strncat_s (char *dest, r_size_t dmax, const char *src, r_size_t slen);
errno_t strcpy_s(char *dest, r_size_t dmax, const char *src);
void itoahex_s( char *dest, r_size_t dmax, rmtS32 value );

3.1.5 SHA1

SHA-1加密雜湊函式

// SHA-1 hash value: 20 bytes of digest data, as returned by SHA1_Calculate
typedef struct
{
    rmtU8 data[20];
} SHA1;

unsigned int rol(const unsigned int value, const unsigned int steps);
void clearWBuffert(unsigned int* buffert);
void innerHash(unsigned int* result, unsigned int* w);
void calc(const void* src, const int bytelength, unsigned char* hash);
SHA1 SHA1_Calculate(const void* src, unsigned int length);

3.1.6 BASE64

Base-64編碼器

rmtU32 Base64_CalculateEncodedLength(rmtU32 length);
void Base64_Encode(const rmtU8* in_bytes, rmtU32 length, rmtU8* out_bytes);

3.1.7 Murmurhash

Murmur-Hash 3

rmtU32 rotl32(rmtU32 x, rmtS8 r);
rmtU32 getblock32(const rmtU32* p, int i);
rmtU32 fmix32(rmtU32 h);
rmtU32 MurmurHash3_x86_32(const void* key, int len, rmtU32 seed);

3.2 執行緒併發功能類

3.2.1 Tls

執行緒區域性儲存(pthread_key_create/pthread_setspecific)

rmtError tlsAlloc(rmtTLS* handle);
void tlsFree(rmtTLS handle);
void tlsSet(rmtTLS handle, void* value);
void* tlsGet(rmtTLS handle);

3.2.2 Atomic

原子操作

rmtBool AtomicCompareAndSwap(rmtU32 volatile* val, long old_val, long new_val);
rmtBool AtomicCompareAndSwapPointer(long* volatile* ptr, long* old_ptr, long* new_ptr);
rmtS32 AtomicAdd(rmtS32 volatile* value, rmtS32 add);
void AtomicSub(rmtS32 volatile* value, rmtS32 sub);
void CompilerWriteFence();
void CompilerReadFence();
rmtU32 LoadAcquire(rmtU32* volatile address);
long* LoadAcquirePointer(long* volatile* ptr);
void StoreRelease(rmtU32* volatile address, rmtU32 value);
void StoreReleasePointer(long* volatile* ptr, long* value);

3.2.3 Threads

執行緒處理

typedef struct Thread_t rmtThread;
// Signature of a thread callback: receives its rmtThread and returns an rmtError
typedef rmtError(*ThreadProc)(rmtThread* thread);

// Thread handle together with its callback, user parameter and exit/error state
struct Thread_t
{
    // OS-specific data
    #if defined(RMT_PLATFORM_WINDOWS)
        HANDLE handle;
    #else
        pthread_t handle;
    #endif

    // Callback executed when the thread is created
    ThreadProc callback;

    // Caller-specified parameter passed to Thread_Create
    void* param;

    // Error state returned from callback
    rmtError error;

    // External threads can set this to request an exit
    volatile rmtBool request_exit;

};

int rmtThread_Valid(rmtThread* thread);
rmtError rmtThread_Constructor(rmtThread* thread, ThreadProc callback, void* param);
void rmtThread_RequestExit(rmtThread* thread);
void rmtThread_Join(rmtThread* thread);
void rmtThread_Destructor(rmtThread* thread);

3.2.4 DynBuf

動態緩衝器

// Dynamic buffer: a byte array tracked by separate allocated/used byte counts
typedef struct
{
    // NOTE(review): presumably the step size by which the allocation grows — confirm in Buffer_Grow
    rmtU32 alloc_granularity;

    // Byte counts for the storage behind `data`: total allocated and currently used
    // (as the field names indicate)
    rmtU32 bytes_allocated;
    rmtU32 bytes_used;

    // Start of the buffer's storage
    rmtU8* data;
} Buffer;

rmtError Buffer_Constructor(Buffer* buffer, rmtU32 alloc_granularity);
void Buffer_Destructor(Buffer* buffer);
rmtError Buffer_Grow(Buffer* buffer, rmtU32 length);
rmtError Buffer_Write(Buffer* buffer, const void* data, rmtU32 length);
rmtError Buffer_WriteStringZ(Buffer* buffer, rmtPStr string);
void U32ToByteArray(rmtU8* dest, rmtU32 value);
rmtError Buffer_WriteU32(Buffer* buffer, rmtU32 value);
rmtBool IsLittleEndian();
rmtError Buffer_WriteU64(Buffer* buffer, rmtU64 value);
rmtError Buffer_WriteStringWithLength(Buffer* buffer, rmtPStr string);

3.3 網路服務功能類

3.3.1 Sockets

TCP/IP Sockets

// TCP/IP socket wrapper holding the underlying SOCKET handle
typedef struct
{
    SOCKET socket;
} TCPSocket;


// Socket state as returned by TCPSocket_PollStatus / WebSocket_PollStatus
typedef struct
{
    rmtBool can_read;
    rmtBool can_write;
    rmtError error_state;
} SocketStatus;

rmtError TCPSocket_Constructor(TCPSocket* tcp_socket);
void TCPSocket_Destructor(TCPSocket* tcp_socket);
rmtError TCPSocket_RunServer(TCPSocket* tcp_socket, rmtU16 port, rmtBool reuse_open_port, rmtBool limit_connections_to_localhost);
void TCPSocket_Close(TCPSocket* tcp_socket);
SocketStatus TCPSocket_PollStatus(TCPSocket* tcp_socket);
rmtError TCPSocket_AcceptConnection(TCPSocket* tcp_socket, TCPSocket** client_socket);
int TCPSocketWouldBlock();
rmtError TCPSocket_Send(TCPSocket* tcp_socket, const void* data, rmtU32 length, rmtU32 timeout_ms);
rmtError TCPSocket_Receive(TCPSocket* tcp_socket, void* data, rmtU32 length, rmtU32 timeout_ms);

3.3.2 WebSockets

WebSockets

// WebSocket mode; passed to WebSocket_RunServer and stored in WebSocket::mode
enum WebSocketMode
{
    WEBSOCKET_NONE = 0,
    WEBSOCKET_TEXT = 1,
    WEBSOCKET_BINARY = 2,
};


// WebSocket connection state layered on top of a TCPSocket
typedef struct
{
    // Underlying TCP socket
    TCPSocket* tcp_socket;

    enum WebSocketMode mode;

    // NOTE(review): presumably the bytes still to be read from the frame being received
    // and the current position within the 4-byte mask — confirm in WebSocket_Receive
    rmtU32 frame_bytes_remaining;
    rmtU32 mask_offset;

    // The 4 mask bytes, also addressable as a single rmtU32 through the union
    union
    {
        rmtU8 mask[4];
        rmtU32 mask_u32;
    } data;

} WebSocket;

char* GetField(char* buffer, r_size_t buffer_length, rmtPStr field_name);
rmtError WebSocketHandshake(TCPSocket* tcp_socket, rmtPStr limit_host);
rmtError WebSocket_Constructor(WebSocket* web_socket, TCPSocket* tcp_socket);
void WebSocket_Destructor(WebSocket* web_socket);
rmtError WebSocket_RunServer(WebSocket* web_socket, rmtU16 port, rmtBool reuse_open_port, rmtBool limit_connections_to_localhost, enum WebSocketMode mode);
void WebSocket_Close(WebSocket* web_socket);
SocketStatus WebSocket_PollStatus(WebSocket* web_socket);
rmtError WebSocket_AcceptConnection(WebSocket* web_socket, WebSocket** client_socket);
void WriteSize(rmtU32 size, rmtU8* dest, rmtU32 dest_size, rmtU32 dest_offset);
void WebSocket_PrepareBuffer(Buffer* buffer);
rmtU32 WebSocket_FrameHeaderSize(rmtU32 length);
void WebSocket_WriteFrameHeader(WebSocket* web_socket, rmtU8* dest, rmtU32 length);
rmtError WebSocket_Send(WebSocket* web_socket, const void* data, rmtU32 length, rmtU32 timeout_ms);
rmtError ReceiveFrameHeader(WebSocket* web_socket);
rmtError WebSocket_Receive(WebSocket* web_socket, void* data, rmtU32* msg_len, rmtU32 length, rmtU32 timeout_ms);

3.3.3 Network

網路伺服器

// Callback type for messages received from the client.
// NOTE(review): the arguments look like (context, message data, message length), which
// matches the signature of Remotery_ReceiveMessage — confirm at the call site.
typedef rmtError (*Server_ReceiveHandler)(void*, char*, rmtU32);

// Network server state: a listening WebSocket, at most one client WebSocket, the
// listen settings and the handler invoked for incoming client messages
typedef struct
{
    WebSocket* listen_socket;

    WebSocket* client_socket;

    rmtU32 last_ping_time;

    // Listen settings; the same three values are taken by Server_Constructor and
    // Server_CreateListenSocket
    rmtU16 port;

    rmtBool reuse_open_port;
    rmtBool limit_connections_to_localhost;

    // A dynamically-sized buffer used for binary-encoding messages and sending to the client
    Buffer* bin_buf;

    // Handler for receiving messages from the client
    Server_ReceiveHandler receive_handler;
    void* receive_handler_context;
} Server;

rmtError Server_CreateListenSocket(Server* server, rmtU16 port, rmtBool reuse_open_port, rmtBool limit_connections_to_localhost);
rmtError Server_Constructor(Server* server, rmtU16 port, rmtBool reuse_open_port, rmtBool limit_connections_to_localhost);
void Server_Destructor(Server* server);
rmtBool Server_IsClientConnected(Server* server);
void Server_DisconnectClient(Server* server);
rmtError Server_Send(Server* server, const void* data, rmtU32 length, rmtU32 timeout);
rmtError Server_ReceiveMessage(Server* server, char message_first_byte, rmtU32 message_length);
void Server_Update(Server* server);

3.4 負載率取樣功能類

3.4.1 Timers

特定於平臺的計時器

功能類函式介面:

// Microsecond-accuracy high-performance counter
// When RMT_PLATFORM_WINDOWS is not defined, LARGE_INTEGER is aliased to rmtU64 so the
// struct below can be declared the same way on every platform.
#ifndef RMT_PLATFORM_WINDOWS
    typedef rmtU64 LARGE_INTEGER;
#endif
typedef struct
{
    // NOTE(review): presumably the counter value captured by usTimer_Init and the factor
    // that converts raw counter ticks to microseconds — confirm in usTimer_Init/usTimer_Get
    LARGE_INTEGER counter_start;
    double counter_scale;
} usTimer;

void usTimer_Init(usTimer* timer);
rmtU32 msTimer_Get();       // 獲取ms時間值
rmtU64 usTimer_Get(usTimer* timer);
void msSleep(rmtU32 time_ms);

3.4.2 Sample

基本取樣說明(預設情況下為CPU)

// Kinds of sample (CPU plus one per GPU API). SampleType_Count comes last, so it equals
// the number of kinds and is used to size ThreadSampler::sample_trees.
typedef enum SampleType
{
    SampleType_CPU,
    SampleType_CUDA,
    SampleType_D3D11,
    SampleType_OpenGL,
    SampleType_Metal,
    SampleType_Count,
} SampleType;

// Base sample description (CPU by default): one node of a per-thread sample tree,
// carrying its identity, its tree links and its timing data
typedef struct Sample
{
    // Inherit so that samples can be quickly allocated
    ObjectLink Link;

    enum SampleType type;

    // Used to anonymously copy sample data without knowing its type
    rmtU32 size_bytes;

    // Hash generated from sample name
    rmtU32 name_hash;

    // Unique, persistent ID among all samples
    rmtU32 unique_id;

    // Null-terminated string storing the hash-prefixed 6-digit colour
    // ('#' + 6 digits + terminator fills the 8 bytes)
    rmtU8 unique_id_html_colour[8];

    // Links to related samples in the tree
    struct Sample* parent;
    struct Sample* first_child;
    struct Sample* last_child;
    struct Sample* next_sibling;

    // Keep track of child count to distinguish from repeated calls to the same function at the same stack level
    // This is also mixed with the callstack hash to allow consistent addressing of any point in the tree
    rmtU32 nb_children;

    // Sample end points and length in microseconds
    rmtU64 us_start;
    rmtU64 us_end;
    rmtU64 us_length;

    // Total sampled length of all children
    rmtU64 us_sampled_length;

    // Number of times this sample was used in a call in aggregate mode, 1 otherwise
    rmtU32 call_count;

    // Current and maximum sample recursion depths
    rmtU16 recurse_depth;
    rmtU16 max_recurse_depth;

} Sample;

rmtError Sample_Constructor(Sample* sample);
void Sample_Destructor(Sample* sample);
void Sample_Prepare(Sample* sample, rmtU32 name_hash, Sample* parent);
rmtError bin_Sample(Buffer* buffer, Sample* sample);
rmtError bin_SampleArray(Buffer* buffer, Sample* parent_sample);

3.4.3 SampleTree

帶有分配器的樣本樹

// Sample tree with its own allocator
typedef struct SampleTree
{
    // Allocator for all samples
    ObjectAllocator* allocator;

    // Root sample for all samples created by this thread
    Sample* root;

    // Most recently pushed sample
    Sample* current_parent;

} SampleTree;

// Message data describing one sample tree: its root sample, an allocator and a thread name.
// NOTE(review): presumably the payload of a MsgID_SampleTree message, with `allocator`
// being the one the samples are returned to — confirm in AddSampleTreeMessage.
typedef struct Msg_SampleTree
{
    Sample* root_sample;

    ObjectAllocator* allocator;

    rmtPStr thread_name;
} Msg_SampleTree;

rmtError SampleTree_Constructor(SampleTree* tree, rmtU32 sample_size, ObjConstructor constructor, ObjDestructor destructor);
void SampleTree_Destructor(SampleTree* tree);
rmtU32 HashCombine(rmtU32 hash_a, rmtU32 hash_b);
rmtError SampleTree_Push(SampleTree* tree, rmtU32 name_hash, rmtU32 flags, Sample** sample);
void SampleTree_Pop(SampleTree* tree, Sample* sample);
ObjectLink* FlattenSampleTree(Sample* sample, rmtU32* nb_samples);
void FreeSampleTree(Sample* sample, ObjectAllocator* allocator);

// Prototype of AddSampleTreeMessage.
// NOTE(review): judging by its name and parameters it posts a sample tree (root sample,
// its allocator and the owning thread's name) onto `queue` — confirm against the definition.
void AddSampleTreeMessage(rmtMessageQueue* queue, Sample* sample, ObjectAllocator* allocator, rmtPStr thread_name, struct ThreadSampler* thread_sampler);

3.4.4 Tsampler

每個執行緒的取樣器

// Per-thread sampler: the thread's display name, one sample tree per sample type and
// the table of sample names seen on the thread
typedef struct ThreadSampler
{
    // Name to assign to the thread in the viewer
    rmtS8 name[256];

    // Store a unique sample tree for each type
    SampleTree* sample_trees[SampleType_Count];

    // Table of all sample names encountered on this thread
    StringTable* names;

#if RMT_USE_D3D11
    D3D11* d3d11;
#endif

    // Next in the global list of active thread samplers
    struct ThreadSampler* volatile next;

} ThreadSampler;

rmtError ThreadSampler_Constructor(ThreadSampler* thread_sampler);
void ThreadSampler_Destructor(ThreadSampler* ts);
rmtError ThreadSampler_Push(SampleTree* tree, rmtU32 name_hash, rmtU32 flags, Sample** sample);
rmtBool ThreadSampler_Pop(ThreadSampler* ts, rmtMessageQueue* queue, Sample* sample);
rmtU32 ThreadSampler_GetNameHash(ThreadSampler* ts, rmtPStr name, rmtU32* hash_cache);

3.5 訊息佇列類

3.5.1 Vmbuffer

使用虛擬記憶體實現自動環繞(wrap-around)的映象緩衝區

// Mirror buffer that uses virtual memory for automatic wrap-around: the buffer is
// followed in the address space by a second part at ptr+size
typedef struct VirtualMirrorBuffer
{
    // Page-rounded size of the buffer without mirroring
    rmtU32 size;

    // Pointer to the first part of the mirror
    // The second part comes directly after at ptr+size bytes
    rmtU8* ptr;

    // Platform-specific mapping state, present on Windows builds only
#ifdef RMT_PLATFORM_WINDOWS
    #ifdef _XBOX_ONE
        size_t page_count;
        size_t* page_mapping;
    #else
        HANDLE file_map_handle;
    #endif
#endif

} VirtualMirrorBuffer;

rmtError VirtualMirrorBuffer_Constructor(VirtualMirrorBuffer* buffer, rmtU32 size, int nb_attempts);
void VirtualMirrorBuffer_Destructor(VirtualMirrorBuffer* buffer);

3.5.2 HashTable

用於插入/查詢的整數對雜湊對映。為了增加簡單性,沒有刪除。

// One key/value entry of an rmtHashTable
typedef struct
{
    // Non-zero, pre-hashed key
    rmtU32 key;

    // Value that's not equal to RMT_NOT_FOUND
    rmtU32 value;
} HashSlot;

// Integer-pair hash map supporting insert and find only (no removal, for simplicity)
typedef struct
{
    // Stats
    // NOTE(review): presumably slot capacity and slots currently occupied — confirm in
    // rmtHashTable_Insert / rmtHashTable_Resize
    rmtU32 max_nb_slots;
    rmtU32 nb_slots;

    // Data
    HashSlot* slots;
} rmtHashTable;

rmtError rmtHashTable_Constructor(rmtHashTable* table, rmtU32 max_nb_slots);
void rmtHashTable_Destructor(rmtHashTable* table);
rmtError rmtHashTable_Insert(rmtHashTable* table, rmtU32 key, rmtU32 value);
rmtError rmtHashTable_Resize(rmtHashTable* table);
rmtU32 rmtHashTable_Find(rmtHashTable* table, rmtU32 key);

3.5.3 StringTable

從字串雜湊對映到本地緩衝區中的字串偏移

// Map from string hash to the string's offset within a local text buffer
typedef struct
{
    // Growable dynamic array of strings added so far
    Buffer* text;

    // Map from text hash to text location in the buffer
    rmtHashTable* text_map;
} StringTable;

rmtError StringTable_Constructor(StringTable* table);
void StringTable_Destructor(StringTable* table);
rmtPStr StringTable_Find(StringTable* table, rmtU32 name_hash);
void StringTable_Insert(StringTable* table, rmtU32 name_hash, rmtPStr name);

3.5.4 Messageq

多生產者、單消費者訊息佇列

// Identifies the kind of a Message in the message queue.
// NOTE(review): MsgID_Force32Bits is, going by its name and value, there only to force
// the enum to a 32-bit representation — confirm nothing uses it as a real message ID.
typedef enum MessageID
{
    MsgID_NotReady,
    MsgID_LogText,
    MsgID_SampleTree,
    MsgID_None,
    MsgID_Force32Bits = 0xFFFFFFFF,
} MessageID;

// Header of one entry in the message queue, with the payload bytes starting at `payload`
typedef struct Message
{
    MessageID id;

    rmtU32 payload_size;

    // For telling which thread the message came from in the debugger
    struct ThreadSampler* thread_sampler;

    // First byte of the payload.
    // NOTE(review): declared with one element but presumably followed by payload_size
    // bytes in the queue — see rmtMessageQueue_SizeForPayload to confirm.
    rmtU8 payload[1];
} Message;

// Multi-producer, single-consumer message queue backed by a VirtualMirrorBuffer
typedef struct rmtMessageQueue
{
    rmtU32 size;

    // The physical address of this data buffer is pointed to by two sequential
    // virtual memory pages, allowing automatic wrap-around of any reads or writes
    // that exceed the limits of the buffer.
    VirtualMirrorBuffer* data;

    // Read/write position never wrap allowing trivial overflow checks
    // with easier debugging
    rmtU32 read_pos;
    rmtU32 write_pos;

} rmtMessageQueue;

rmtError rmtMessageQueue_Constructor(rmtMessageQueue* queue, rmtU32 size);
void rmtMessageQueue_Destructor(rmtMessageQueue* queue);
rmtU32 rmtMessageQueue_SizeForPayload(rmtU32 payload_size);
static Message* rmtMessageQueue_AllocMessage(rmtMessageQueue* queue, rmtU32 payload_size, struct ThreadSampler* thread_sampler);
void rmtMessageQueue_CommitMessage(Message* message, MessageID id);
Message* rmtMessageQueue_PeekNextMessage(rmtMessageQueue* queue);
void rmtMessageQueue_ConsumeNextMessage(rmtMessageQueue* queue, Message* message);

3.6 主功能類

3.6.1 Remotery

// Main profiler object: owns the network server, the CPU timer, the list of
// per-thread samplers, the message queue feeding the Remotery thread and that thread
// itself, plus optional per-API state selected by the RMT_USE_* macros
struct Remotery
{
    Server* server;

    // Microsecond accuracy timer for CPU timestamps
    usTimer timer;

    // TLS handle; NOTE(review): presumably the slot holding each thread's ThreadSampler,
    // going by the name — confirm in Remotery_GetThreadSampler
    rmtTLS thread_sampler_tls_handle;

    // Linked list of all known threads being sampled
    ThreadSampler* volatile first_thread_sampler;

    // Queue between clients and main remotery thread
    rmtMessageQueue* mq_to_rmt_thread;

    // The main server thread
    rmtThread* thread;

    // Set to trigger a map of each message on the remotery thread message queue
    void (*map_message_queue_fn)(Remotery* rmt, Message*);
    void* map_message_queue_data;

#if RMT_USE_CUDA
    rmtCUDABind cuda;
#endif

#if RMT_USE_OPENGL
    OpenGL* opengl;
#endif

#if RMT_USE_METAL
    Metal* metal;
#endif
};

void GetSampleDigest(Sample* sample, rmtU32* digest_hash, rmtU32* nb_samples);
rmtError Remotery_SendLogTextMessage(Remotery* rmt, Message* message);
rmtError bin_SampleTree(Buffer* buffer, Msg_SampleTree* msg);
rmtError Remotery_SendSampleTreeMessage(Remotery* rmt, Message* message);
rmtError Remotery_ConsumeMessageQueue(Remotery* rmt);
void Remotery_FlushMessageQueue(Remotery* rmt);
void Remotery_MapMessageQueue(Remotery* rmt);
rmtError Remotery_ThreadMain(rmtThread* thread);
rmtError Remotery_ReceiveMessage(void* context, char* message_data, rmtU32 message_length);
rmtError Remotery_Constructor(Remotery* rmt);
void Remotery_Destructor(Remotery* rmt);
// Outputs a ThreadSampler through *thread_sampler.
// NOTE(review): presumably the sampler belonging to the calling thread — confirm against the definition.
rmtError Remotery_GetThreadSampler(Remotery* rmt, ThreadSampler** thread_sampler);
void Remotery_DestroyThreadSamplers(Remotery* rmt);
void* CRTMalloc(void* mm_context, rmtU32 size);
void CRTFree(void* mm_context, void* ptr);
void* CRTRealloc(void* mm_context, void* ptr, rmtU32 size);

RMT API在標頭檔案中進行了宣告,當Remotery作為第三方庫使用時,這些API供其他程式或庫呼叫:

RMT_API rmtSettings* _rmt_Settings( void );
RMT_API enum rmtError _rmt_CreateGlobalInstance(Remotery** remotery);
RMT_API void _rmt_DestroyGlobalInstance(Remotery* remotery);
RMT_API void _rmt_SetGlobalInstance(Remotery* remotery);
RMT_API Remotery* _rmt_GetGlobalInstance(void);
RMT_API void _rmt_SetCurrentThreadName(rmtPStr thread_name);
RMT_API void _rmt_LogText(rmtPStr text);
RMT_API void _rmt_BeginCPUSample(rmtPStr name, rmtU32 flags, rmtU32* hash_cache);
RMT_API void _rmt_EndCPUSample(void);

3.6.2 CUDA

CUDA事件負載率取樣

// Sample specialisation for CUDA: a base Sample plus the two CUDA events that
// bracket the sampled work
typedef struct CUDASample
{
    // IS-A inheritance relationship
    Sample base;

    // Pair of events that wrap the sample
    CUevent event_start;
    CUevent event_end;

} CUDASample;

rmtError MapCUDAResult(CUresult result);
rmtError CUDASetContext(void* context);
rmtError CUDAGetContext(void** context);
rmtError CUDAEnsureContext();
rmtError CUDAEventCreate(CUevent* phEvent, unsigned int Flags);
rmtError CUDAEventDestroy(CUevent hEvent);
rmtError CUDAEventRecord(CUevent hEvent, void* hStream);
rmtError CUDAEventQuery(CUevent hEvent);
rmtError CUDAEventElapsedTime(float* pMilliseconds, CUevent hStart, CUevent hEnd);
rmtError CUDASample_Constructor(CUDASample* sample);
void CUDASample_Destructor(CUDASample* sample);
rmtBool AreCUDASamplesReady(Sample* sample);
rmtBool GetCUDASampleTimes(Sample* root_sample, Sample* sample);

RMT_API void _rmt_BindCUDA(const rmtCUDABind* bind);
RMT_API void _rmt_BeginCUDASample(rmtPStr name, rmtU32* hash_cache, void* stream);
RMT_API void _rmt_EndCUDASample(void* stream);

3.6.3 D3D11

Direct3D 11事件取樣

3.6.4 OPENGL

OpenGL事件取樣

3.6.5 METAL

Metal事件取樣

4.Remotery主體設計

主函式主流程:

rmtError Remotery_ThreadMain(rmtThread* thread)

image

網路功能類繼承關係:

image

如何計算的每個函式的CPU負載呢?

初步分析,Remotery的做法是:在執行一個執行緒/函式之前,先呼叫_rmt_BeginCPUSample(rmtPStr name, rmtU32 flags, rmtU32* hash_cache),將該執行緒/函式名(name)加入一個hash表中,並獲取當前時間;待該執行緒/函式執行完畢後,再呼叫_rmt_EndCPUSample()獲取一次時間。兩次時間之差即為該函式的執行時間,以此作為計算負載的依據。










開源地址:
https://gitee.com/stlstl/Remotery.git

相關文章