AI Infra Engineering: Abstraction

底层硬件抽象：设备、运行时、存储

`Device` 抽象层：向上提供统一的硬件接口

将 Device/Runtime 抽象是 AI Infra 工程里非常常见的步骤，因为通常来说，我们的 AI Infra 需要支持不同的后端，如 Intel CPU, AMD CPU, NVIDIA, Ascend, 摩尔线程等等不同 GPU/CPU．而在不同的计算后端上，执行相同的计算可能需要不同的 API 接口，也会有不同的优化策略，所以我们需要将 Device/Runtime 抽象出来．

一般来说，Device 抽象最重要的是 API 设计，比如针对内存数据复制设计统一的 API 格式．例如，在 InfiniTensor 的 llaisys 大模型推理项目中，其 API 设计如下：

llaisys 对 Device 进行抽象并提供一组 API

// C-facing runtime interface: one function-pointer type per operation, plus a
// table (LlaisysRuntimeAPI) that each backend fills with its implementations.
// NOTE(review): __C presumably expands to an extern "C" block — confirm
// against the macro's definition.
__C {
    // Runtime API Functions
    // Device
    // Reports how many devices the backend exposes.
    typedef int (*get_device_count_api)();
    // Selects a device; the backends in this file name the argument device_id.
    typedef void (*set_device_api)(int);
    // Device-wide synchronization.
    typedef void (*device_synchronize_api)();
    // Stream
    // Creates a stream and returns its opaque handle.
    typedef llaisysStream_t (*create_stream_api)();
    // Destroys a stream handle.
    typedef void (*destroy_stream_api)(llaisysStream_t);
    // Synchronizes on the given stream.
    typedef void (*stream_synchronize_api)(llaisysStream_t);
    // Memory
    // Allocates the given number of bytes of device memory.
    typedef void *(*malloc_device_api)(size_t);
    // Frees memory obtained from malloc_device.
    typedef void (*free_device_api)(void *);
    // Allocates the given number of bytes of host memory.
    typedef void *(*malloc_host_api)(size_t);
    // Frees memory obtained from malloc_host.
    typedef void (*free_host_api)(void *);
    // Memory copy
    // Arguments, in order (as named by the backends): dst, src, size, kind.
    typedef void (*memcpy_sync_api)(
        void *,
        const void *,
        size_t,
        llaisysMemcpyKind_t);
    // Same as memcpy_sync, with the stream to use as the last argument.
    typedef void (*memcpy_async_api)(
        void *,
        const void *,
        size_t,
        llaisysMemcpyKind_t,
        llaisysStream_t);

    // Function table filled in by each backend. The backends initialize it
    // positionally, so the member order here must stay in sync with their
    // initializer lists.
    struct LlaisysRuntimeAPI {
        get_device_count_api get_device_count;
        set_device_api set_device;
        device_synchronize_api device_synchronize;
        create_stream_api create_stream;
        destroy_stream_api destroy_stream;
        stream_synchronize_api stream_synchronize;
        malloc_device_api malloc_device;
        free_device_api free_device;
        malloc_host_api malloc_host;
        free_host_api free_host;
        memcpy_sync_api memcpy_sync;
        memcpy_async_api memcpy_async;
    };

    // Llaisys API for getting the runtime APIs
    __export const LlaisysRuntimeAPI *llaisysGetRuntimeAPI(llaisysDeviceType_t);

    // Llaisys API for switching device context
    // NOTE(review): the int argument is presumably the device id — confirm.
    __export void llaisysSetContextRuntime(llaisysDeviceType_t, int);
}

`RuntimeAPI` 抽象层：负责硬件的资源分配

Device 主要是框架对不同计算后端做的抽象，而 RuntimeAPI 则可以理解为在具体的平台上，实现 Device 接口，从而在计算平台上实现功能．例如，在 llaisys 框架下，是这样实现 CPU Backend 的：

CPU Backend 对接口的实现

#include "../runtime_api.hpp"

#include <cstdlib>
#include <cstring>

namespace llaisys::device::cpu {

// CPU implementation of the runtime function table. Host memory and "device"
// memory are the same thing here, and there are no real streams, so most
// entries are trivial.
namespace runtime_api {

// The CPU backend always reports exactly one device.
int getDeviceCount() {
    return 1;
}

// Nothing to select on the CPU backend.
void setDevice(int) {
    // do nothing
}

// All CPU operations in this backend are synchronous already.
void deviceSynchronize() {
    // do nothing
}

// Returns a zero-valued handle: the CPU backend has no real streams.
llaisysStream_t createStream() {
    return llaisysStream_t{}; // null stream
}

// Streams are never really created, so there is nothing to destroy.
void destroyStream(llaisysStream_t /*stream*/) {
    // do nothing
}

// Copies run synchronously, so there is nothing to wait for.
void streamSynchronize(llaisysStream_t /*stream*/) {
    // do nothing
}

// "Device" memory on the CPU backend is ordinary heap memory.
// Returns nullptr when std::malloc fails.
void *mallocDevice(size_t size) {
    return std::malloc(size);
}

// Releases memory obtained from mallocDevice.
void freeDevice(void *ptr) {
    std::free(ptr);
}

// Host memory and device memory are the same on the CPU backend.
void *mallocHost(size_t size) {
    return mallocDevice(size);
}

// Releases memory obtained from mallocHost.
void freeHost(void *ptr) {
    freeDevice(ptr);
}

// Copies `size` bytes from src to dst. The copy kind is irrelevant on the CPU
// backend because both sides live in host memory.
void memcpySync(void *dst, const void *src, size_t size, llaisysMemcpyKind_t /*kind*/) {
    // std::memcpy has undefined behavior for null pointers even when the
    // count is zero, so an empty copy is handled without calling it.
    if (size == 0) {
        return;
    }
    std::memcpy(dst, src, size);
}

// There are no real streams on the CPU backend, so the "async" copy simply
// performs the synchronous one.
void memcpyAsync(void *dst, const void *src, size_t size, llaisysMemcpyKind_t kind, llaisysStream_t /*stream*/) {
    memcpySync(dst, src, size, kind);
}

// Positional initialization: the order must match the member order of
// LlaisysRuntimeAPI.
static const LlaisysRuntimeAPI RUNTIME_API = {
    &getDeviceCount,
    &setDevice,
    &deviceSynchronize,
    &createStream,
    &destroyStream,
    &streamSynchronize,
    &mallocDevice,
    &freeDevice,
    &mallocHost,
    &freeHost,
    &memcpySync,
    &memcpyAsync};

} // namespace runtime_api

// Returns the CPU backend's function table.
const LlaisysRuntimeAPI *getRuntimeAPI() {
    return &runtime_api::RUNTIME_API;
}

} // namespace llaisys::device::cpu

CUDA Backend 的实现

#include "../runtime_api.hpp"

#include <cstdlib>
#include <cstring>
#include <cuda_runtime.h>

namespace llaisys::device::nvidia {

namespace runtime_api {
int getDeviceCount() {
    int count;
    cudaGetDeviceCount(&count);
    return count;
}

void setDevice(int device_id) {
    cudaSetDevice(device_id);
}

void deviceSynchronize() {
    cudaDeviceSynchronize();
}

llaisysStream_t createStream() {
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    return reinterpret_cast<llaisysStream_t>(stream);
}

void destroyStream(llaisysStream_t stream) {
    cudaStreamDestroy(reinterpret_cast<cudaStream_t>(stream));
}
void streamSynchronize(llaisysStream_t stream) {
    cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(stream));
}

void *mallocDevice(size_t size) {
    void *ptr;
    cudaMalloc(&ptr, size);
    return ptr;
}

void freeDevice(void *ptr) {
    cudaFree(ptr);
}

void *mallocHost(size_t size) {
    void *ptr;
    cudaMallocHost(&ptr, size);
    return ptr;
}

void freeHost(void *ptr) {
    cudaFreeHost(ptr);
}

void memcpySync(void *dst, const void *src, size_t size, llaisysMemcpyKind_t kind) {
    cudaMemcpy(dst, src, size, static_cast<cudaMemcpyKind>(kind));
}

void memcpyAsync(void *dst, const void *src, size_t size, llaisysMemcpyKind_t kind, llaisysStream_t stream) {
    cudaMemcpyAsync(dst, src, size, static_cast<cudaMemcpyKind>(kind), reinterpret_cast<cudaStream_t>(stream));
}

static const LlaisysRuntimeAPI RUNTIME_API = {
    &getDeviceCount,
    &setDevice,
    &deviceSynchronize,
    &createStream,
    &destroyStream,
    &streamSynchronize,
    &mallocDevice,
    &freeDevice,
    &mallocHost,
    &freeHost,
    &memcpySync,
    &memcpyAsync};

} // namespace runtime_api

const LlaisysRuntimeAPI *getRuntimeAPI() {
    return &runtime_api::RUNTIME_API;
}
} // namespace llaisys::device::nvidia

然后通过 switch case 和一个 Unified API 来根据 device 分发 RuntimeAPI

使用统一的函数根据不同的 backend 分发 RuntimeAPI

// Maps a device type to the function table of the backend that serves it.
// The NVIDIA table is only reachable when the build defines ENABLE_NVIDIA_API;
// otherwise the "unsupported" table is handed out for that device type.
const LlaisysRuntimeAPI *getRuntimeAPI(llaisysDeviceType_t device_type) {
    if (device_type == LLAISYS_DEVICE_CPU) {
        return llaisys::device::cpu::getRuntimeAPI();
    }

    if (device_type == LLAISYS_DEVICE_NVIDIA) {
#ifdef ENABLE_NVIDIA_API
        return llaisys::device::nvidia::getRuntimeAPI();
#else
        return getUnsupportedRuntimeAPI();
#endif
    }

    // Any other device type is not handled by this dispatcher.
    EXCEPTION_UNSUPPORTED_DEVICE;
    return nullptr;
}

`MemoryAllocator` Abstraction

MemoryAllocator 依赖于 RuntimeAPI，根据计算后端调整内存分配策略．因为分配策略需要可替换，所以它被设计为抽象基类（allocate/release 为纯虚函数）；切换计算后端时，只需要传入不同的 RuntimeAPI 即可．

// Abstract base for memory-allocation strategies. A concrete allocator is
// given a backend's LlaisysRuntimeAPI table and implements allocate/release
// on top of it.
class MemoryAllocator {
protected:
    // Runtime API table of the backend this allocator serves.
    const LlaisysRuntimeAPI *_api;
    // Protected: only subclasses construct the base.
    MemoryAllocator(const LlaisysRuntimeAPI *runtime_api);

public:
    virtual ~MemoryAllocator() = default;
    // Returns a block of `size` bytes; each strategy supplies the implementation.
    virtual std::byte *allocate(size_t size) = 0;
    // Gives a block back to the allocator.
    // NOTE(review): presumably `memory` must come from allocate() — confirm.
    virtual void release(std::byte *memory) = 0;
};

例如，llaisys 里实现了一个 NaiveAllocator

llaisys 中 NaiveAllocator 的实现

// Forwards the request straight to the backend's malloc_device entry and
// returns the result as a byte pointer.
std::byte *allocate(size_t size) override {
    void *const raw_block = _api->malloc_device(size);
    return static_cast<std::byte *>(raw_block);
}

// Hands the block straight back to the backend's free_device entry.
void release(std::byte *memory) override {
    void *const raw_block = memory;
    _api->free_device(raw_block);
}

`Runtime` Abstraction

每个 Runtime 实例代表一个硬件“节点”，封装了如下内容：

设备类型/编号。
选定的 RuntimeAPI
一个 MemoryAllocator*
一个在创建时由 API 生成的 llaisysStream_t (通常用于 GPU 异步 kernel issue，目前可以忽略)
激活状态标志 _is_active

// Builds the runtime for one (device_type, device_id) pair. The runtime starts
// out inactive (_is_active is false).
Runtime::Runtime(llaisysDeviceType_t device_type, int device_id)
    : _device_type(device_type), _device_id(device_id), _is_active(false) {
    // Look up the function table of the backend that serves this device type.
    _api = llaisys::device::getRuntimeAPI(_device_type);
    // Each runtime creates one stream through its backend at construction.
    _stream = _api->create_stream();
    // NOTE(review): raw owning pointer; presumably released in ~Runtime — confirm.
    _allocator = new allocators::NaiveAllocator(_api);
}

// allocate storage
// Allocates `size` bytes through the runtime's allocator and wraps them in a
// shared Storage handle that is tagged as device memory (is_host == false).
storage_t Runtime::allocateDeviceStorage(size_t size) {
    std::byte *const device_block = _allocator->allocate(size);
    // Plain `new` rather than std::make_shared: Storage's constructor is
    // private and only reachable from Runtime.
    std::shared_ptr<Storage> handle(new Storage(device_block, size, *this, false));
    return handle;
}

// Allocates `size` bytes of host memory through the backend's malloc_host
// entry and wraps them in a shared Storage handle tagged as host memory
// (is_host == true).
storage_t Runtime::allocateHostStorage(size_t size) {
    // malloc_host returns void *, so a static_cast is sufficient; this also
    // matches the cast used by NaiveAllocator::allocate.
    return std::shared_ptr<Storage>(new Storage(
        static_cast<std::byte *>(_api->malloc_host(size)),
        size,
        *this,
        true));
}

`Storage` Abstraction

Storage 依赖于 Runtime，代表一块具体内存．它的构造函数是私有的，只能由 Runtime::allocateDeviceStorage() 和 Runtime::allocateHostStorage() 调用；构造时保存原始指针、大小、所属运行时以及是否为主机内存，析构时通过 Runtime 释放这块内存．

namespace llaisys::core {

// One block of memory obtained through a Runtime. It records the raw pointer,
// its size, the owning runtime and whether the block is host memory; the
// destructor is responsible for releasing the block.
class Storage {
private:
    std::byte *_memory;
    size_t _size;
    Runtime &_runtime;
    bool _is_host;
    // Private: only Runtime (declared a friend below) creates Storage objects.
    Storage(std::byte *memory, size_t size, Runtime &runtime, bool is_host);

public:
    friend class Runtime;
    ~Storage();

    // A Storage owns its block, so a copy would release the same memory
    // twice. Share a Storage through storage_t instead of copying it.
    Storage(const Storage &) = delete;
    Storage &operator=(const Storage &) = delete;

    // Raw pointer to the start of the block.
    std::byte *memory() const;
    // Size of the block, as passed at construction.
    size_t size() const;
    // Device type of the runtime this block belongs to.
    llaisysDeviceType_t deviceType() const;
    // Device id of the runtime this block belongs to.
    int deviceId() const;
    // True when the block was allocated as host memory.
    bool isHost() const;
};

} // namespace llaisys::core

`Context` 抽象层：在 `Runtime` 之上分配资源给线程

core::context() 会返回 thread_local 的 Context 对象．这个 Context 可以理解为当前线程在 runtime (device) 上可以使用的资源，它保证不同线程可以独立设置设备，互不影响．

在 llaisys 中，Context 内部用 unordered_map 维护所有可用的 runtime，并在 setDevice() 中激活对应的 runtime．于是我们就可以用 core::context().runtime() 获取当前线程的 runtime，以便和硬件进行交互．

一般来说，流程如下：

// 1. 创建/设置 runtime/device
// Context 从 map 取/创建 Runtime，并在内部调用 _api->setDevice(0)
core::context().setDevice(LLAISYS_DEVICE_CPU, 0);

// 2. 分配张量存储
auto buf = runtime.allocateDeviceStorage(sz);

// 3. 复制数据
runtime.api()->memcpySync(dst, src, bytes, LLAISYS_MEMCPY_HOST_TO_DEVICE);

// 4. 流与同步
auto s = runtime.createStream();
runtime.memcpyAsync(dst, src, bytes, kind, s);
runtime.streamSynchronize(s);

Conclusion

graph TB;
A[Device] --> B([RuntimeAPI])
B --> C[Memory Allocator]
C --> D[Runtime] --> E[[Storage]]
D --> F[Context]

中层数据抽象

`Tensor` Abstraction

Tensor 是整个库最核心的数据结构之一，它把形状/类型元信息与跨设备内存联系起来，同时提供了一组轻量的元变换和数据移动方法．

TensorMeta 保存数据类型、维度和步幅。
tensor_t 是 shared_ptr，用于引用计数和自动释放。
_storage 是一个 std::shared_ptr<core::Storage>，包含底层内存、设备信息等。
_offset 支持通过切片共享部分缓冲区。

Tensor 数据结构的状态设计

class Tensor;
// Tensors are passed around through shared ownership.
using tensor_t = std::shared_ptr<Tensor>;

// Metadata of a tensor: element data type, dimensions and strides.
struct TensorMeta {
    llaisysDataType_t dtype;
    std::vector<size_t> shape;
    // Signed type, one entry per dimension.
    // NOTE(review): unit (elements vs. bytes) is not visible here — confirm.
    std::vector<ptrdiff_t> strides;
};

// Tensor: private member
// Shape/type metadata.
TensorMeta _meta;
// Shared handle to the underlying memory block.
core::storage_t _storage;
// Offset into the storage; lets several tensors share one buffer.
size_t _offset;

我们先来看一看和计算后端相关的存储．主要有这几个需要关注：

.load() 通过 runtime().api()->memcpy_sync() 从主机内存复制数据到 tensor 所在设备 (Host2Device)
.contiguous() 若数据不连续，则创建新张量并拷贝数据
.to(device) 在 device 上创建新张量，并拷贝数据

AI Infra Engineering: Abstraction

底层硬件抽象：设备、运行时、存储

Device 抽象层：向上提供统一的硬件接口

RuntimeAPI 抽象层：负责硬件的资源分配

MemoryAllocator Abstraction

Runtime Abstraction

Storage Abstraction

Context 抽象层：在 Runtime 之上分配资源给线程