Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 51 additions & 0 deletions src/cuda/allocator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
#define cub hipcub
#define cudaGetDevice hipGetDevice
#define cudaSetDevice hipSetDevice
#define cudaFree hipFree
#define cudaMalloc hipMalloc
#define cudaFreeAsync hipFreeAsync
#define cudaMallocAsync hipMallocAsync
#define cudaDeviceGetAttribute hipDeviceGetAttribute
Expand Down Expand Up @@ -76,6 +78,45 @@ namespace ctranslate2 {
std::unique_ptr<cub::CachingDeviceAllocator> _allocator;
};

// Direct cudaMalloc/cudaFree allocator — no caching, no event tracking.
//
// Exists primarily as an opt-in escape hatch for the deadlock on
// ROCm 7.2.1 / Windows reported in issue #2038: there, the hipcub
// CachingDeviceAllocator's per-block hipEventRecord / hipFree calls
// can hang inside the runtime during `del model` cleanup. Falling
// back to a stateless allocator (no cached blocks, no ready events,
// no per-block streams) sidesteps every code path that the upstream
// bug touches. Costs a fresh hipMalloc per allocation — fine for
// workloads that mostly run inference and rarely allocate.
//
// Enabled with `CT2_CUDA_ALLOCATOR=simple` (or `none`).
class SimpleAllocator : public Allocator {
public:
void* allocate(size_t size, int device_index) override {
int prev_device_index = -1;
if (device_index >= 0) {
CUDA_CHECK(cudaGetDevice(&prev_device_index));
CUDA_CHECK(cudaSetDevice(device_index));
}
void* ptr = nullptr;
CUDA_CHECK(cudaMalloc(&ptr, size));
if (prev_device_index >= 0)
CUDA_CHECK(cudaSetDevice(prev_device_index));
return ptr;
}

void free(void* ptr, int device_index) override {
int prev_device_index = -1;
if (device_index >= 0) {
CUDA_CHECK(cudaGetDevice(&prev_device_index));
CUDA_CHECK(cudaSetDevice(device_index));
}
CUDA_CHECK(cudaFree(ptr));
if (prev_device_index >= 0)
CUDA_CHECK(cudaSetDevice(prev_device_index));
}
};

class CudaAsyncAllocator : public Allocator {
public:
void* allocate(size_t size, int device_index) override {
Expand Down Expand Up @@ -139,6 +180,7 @@ namespace ctranslate2 {
enum class CudaAllocator {
CubCaching,
MallocAsync,
Simple,
};

static CudaAllocator resolve_cuda_allocator() {
Expand All @@ -156,6 +198,10 @@ namespace ctranslate2 {
if (!cuda_malloc_async_is_supported)
throw std::runtime_error("The asynchronous CUDA allocator requires CUDA >= 11.2");
allocator = CudaAllocator::MallocAsync;
} else if (allocator_name == "simple" || allocator_name == "none") {
// Stateless cudaMalloc/cudaFree — opt-in workaround for issue
// #2038 (HIP allocator free path deadlocks on ROCm 7.2.1 / Windows).
allocator = CudaAllocator::Simple;
} else {
throw std::invalid_argument("Invalid CUDA allocator " + allocator_name);
}
Expand All @@ -180,6 +226,11 @@ namespace ctranslate2 {
return allocator;
}

if (cuda_allocator == cuda::CudaAllocator::Simple) {
static cuda::SimpleAllocator allocator;
return allocator;
}

static cuda::CudaAsyncAllocator allocator;
return allocator;
}
Expand Down
Loading