blob: 5299e3d238149c313d20a7a165febd4d36c46c04 [file]
//===--- amdgpu/impl/impl.cpp ------------------------------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "rt.h"
#include <memory>
/*
* Data
*/
// Ask the runtime whether ptr refers to locked memory and, if so, report the
// address to use for it on the agent side.
//
// On success *agentBaseAddress is set to:
//   - nullptr when the runtime does not classify ptr as a locked pointer;
//   - the agent base address plus ptr's offset from the host base address
//     when ptr falls inside the reported host range;
//   - ptr itself when it falls outside that host range.
//
// Returns the status of hsa_amd_pointer_info if that query fails, and
// HSA_STATUS_SUCCESS otherwise.
hsa_status_t is_locked(void *ptr, void **agentBaseAddress) {
  hsa_amd_pointer_info_t info;
  info.size = sizeof(hsa_amd_pointer_info_t);

  const hsa_status_t queryStatus =
      hsa_amd_pointer_info(ptr, &info, /*alloc=*/nullptr,
                           /*num_agents_accessible=*/nullptr,
                           /*accessible=*/nullptr);
  if (queryStatus != HSA_STATUS_SUCCESS) {
    DP("Error when getting pointer info\n");
    return queryStatus;
  }

  // Anything that is not a locked pointer is reported as "not locked".
  if (info.type != HSA_EXT_POINTER_TYPE_LOCKED) {
    *agentBaseAddress = nullptr;
    return HSA_STATUS_SUCCESS;
  }

  char *const hostBegin = (char *)info.hostBaseAddress;
  char *const queried = (char *)ptr;
  const bool insideHostRange =
      hostBegin <= queried && queried < hostBegin + info.sizeInBytes;

  // Address is already device-agent accessible, no offset to compute.
  if (!insideHostRange) {
    *agentBaseAddress = ptr;
    return HSA_STATUS_SUCCESS;
  }

  // The user may pass basePtr+offset, but ROCr always returns the base locked
  // address, so re-apply the offset to the agent-side base.
  const uint64_t offset = (uint64_t)ptr - (uint64_t)info.hostBaseAddress;
  *agentBaseAddress = (void *)((uint64_t)info.agentBaseAddress + offset);
  return HSA_STATUS_SUCCESS;
}
// Submit an asynchronous copy of size bytes from src to dest, using agent for
// both ends of the transfer, and block until the completion signal changes.
// The host pointer (either src or dest) must be locked via
// hsa_amd_memory_lock.
//
// Returns the submission status if hsa_amd_memory_async_copy fails,
// HSA_STATUS_ERROR if the signal ends at any value other than 0, and
// HSA_STATUS_SUCCESS otherwise.
static hsa_status_t invoke_hsa_copy(hsa_signal_t signal, void *dest,
                                    hsa_agent_t agent, const void *src,
                                    size_t size) {
  const hsa_signal_value_t pending = 1;
  const hsa_signal_value_t done = 0;

  hsa_signal_store_screlease(signal, pending);

  const hsa_status_t submitStatus = hsa_amd_memory_async_copy(
      dest, agent, src, agent, size, 0, nullptr, signal);
  if (submitStatus != HSA_STATUS_SUCCESS)
    return submitStatus;

  // async_copy reports success by decrementing and failure by setting to < 0.
  // Keep waiting for as long as the wait hands back the initial value.
  hsa_signal_value_t observed;
  do {
    observed =
        hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_NE, pending,
                                  UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
  } while (observed == pending);

  return observed == done ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
}
// Deleter functor (used in this file with std::unique_ptr<void, ...>) that
// hands the pointer to core::Runtime::Memfree.
struct implFreePtrDeletor {
  void operator()(void *ptr) {
    // Failure to free is deliberately ignored.
    (void)core::Runtime::Memfree(ptr);
  }
};
// Direction of a transfer performed by locking_async_memcpy:
// H2D uses the locked host pointer as the copy source, D2H uses it as the
// copy destination.
enum CopyDirection { H2D, D2H };
// Copy size bytes between host and device, locking the host buffer for the
// duration of the copy if it is not already locked.
//
//   direction   H2D: copy from the locked host pointer to dest (src is not
//               used); D2H: copy from src to the locked host pointer (dest is
//               not used).
//   signal      completion signal handed to invoke_hsa_copy.
//   agent       agent the host buffer is locked for and the copy is issued on.
//   lockingPtr  host pointer that must be locked for the transfer.
//
// If lockingPtr was already locked (as reported by is_locked) it is left
// locked; otherwise it is locked here and always unlocked again before
// returning. Returns the first error encountered: a copy error takes
// precedence over an unlock error.
static hsa_status_t locking_async_memcpy(enum CopyDirection direction,
                                         hsa_signal_t signal, void *dest,
                                         hsa_agent_t agent, void *src,
                                         void *lockingPtr, size_t size) {
  void *lockedPtr = nullptr;
  hsa_status_t err = is_locked(lockingPtr, &lockedPtr);
  if (err != HSA_STATUS_SUCCESS)
    return err;

  // A null result from is_locked means nobody locked the buffer yet, so the
  // lock taken below is ours to release.
  const bool LockedHere = (lockedPtr == nullptr);
  if (LockedHere) {
    hsa_agent_t agents[1] = {agent};
    err = hsa_amd_memory_lock(lockingPtr, size, agents, /*num_agent=*/1,
                              &lockedPtr);
    if (err != HSA_STATUS_SUCCESS)
      return err;
    DP("locking_async_memcpy: lockingPtr=%p lockedPtr=%p Size = %zu\n",
       lockingPtr, lockedPtr, size);
  }

  switch (direction) {
  case H2D:
    err = invoke_hsa_copy(signal, dest, agent, lockedPtr, size);
    break;
  case D2H:
    err = invoke_hsa_copy(signal, lockedPtr, agent, src, size);
    break;
  }

  // Unlock only if not user locked.
  if (!LockedHere)
    return err;

  // Do not leak locked host pointers. If the copy itself failed, report that
  // error and discard the unlock status, because the initial error was in the
  // copy function.
  const hsa_status_t unlockErr = hsa_amd_memory_unlock(lockingPtr);
  return err != HSA_STATUS_SUCCESS ? err : unlockErr;
}
// Copy size bytes from hostSrc to deviceDest.
//
// First tries to lock hostSrc in place and copy from it directly. If that
// fails, allocates a temporary host buffer from MemoryPool, stages the data in
// it and retries the copy from the staging buffer.
//
// Returns HSA_STATUS_SUCCESS if either attempt succeeds, the HostMalloc status
// if the staging allocation fails, or the status of the second copy attempt.
hsa_status_t impl_memcpy_h2d(hsa_signal_t signal, void *deviceDest,
                             void *hostSrc, size_t size,
                             hsa_agent_t device_agent,
                             hsa_amd_memory_pool_t MemoryPool) {
  // Fast path: copy straight out of the caller's buffer.
  if (locking_async_memcpy(CopyDirection::H2D, signal, deviceDest,
                           device_agent, hostSrc, hostSrc,
                           size) == HSA_STATUS_SUCCESS)
    return HSA_STATUS_SUCCESS;

  // async memcpy sometimes fails in situations where
  // allocate + copy succeeds. Looks like it might be related to
  // locking part of a read only segment. Fall back for now.
  void *staging = nullptr;
  const hsa_status_t allocStatus =
      core::Runtime::HostMalloc(&staging, size, MemoryPool);
  if (allocStatus != HSA_STATUS_SUCCESS) {
    DP("HostMalloc: Unable to alloc %zu bytes for temp scratch\n", size);
    return allocStatus;
  }

  // Frees the staging buffer on every exit path below.
  std::unique_ptr<void, implFreePtrDeletor> stagingOwner(staging);

  memcpy(staging, hostSrc, size);
  return locking_async_memcpy(CopyDirection::H2D, signal, deviceDest,
                              device_agent, staging, staging, size);
}
// Copy size bytes from deviceSrc to hostDest.
//
// First tries to lock hostDest in place and copy into it directly. If that
// fails, allocates a temporary host buffer from MemoryPool, copies the device
// data into it and then memcpy's the result into hostDest.
//
// Returns HSA_STATUS_SUCCESS if either attempt succeeds, the HostMalloc status
// if the staging allocation fails, or the status of the second copy attempt
// (in which case hostDest is not written by the fallback).
hsa_status_t impl_memcpy_d2h(hsa_signal_t signal, void *hostDest,
                             void *deviceSrc, size_t size,
                             hsa_agent_t deviceAgent,
                             hsa_amd_memory_pool_t MemoryPool) {
  // device has always visibility over both pointers, so use that
  hsa_status_t err =
      locking_async_memcpy(CopyDirection::D2H, signal, hostDest, deviceAgent,
                           deviceSrc, hostDest, size);
  if (err == HSA_STATUS_SUCCESS)
    return err;

  // async memcpy sometimes fails in situations where
  // allocate + copy succeeds. Looks like it might be related to
  // locking part of a read only segment. Fall back for now.
  void *tempHostPtr;
  hsa_status_t ret = core::Runtime::HostMalloc(&tempHostPtr, size, MemoryPool);
  if (ret != HSA_STATUS_SUCCESS) {
    DP("HostMalloc: Unable to alloc %zu bytes for temp scratch\n", size);
    return ret;
  }

  // Frees the temporary buffer on every exit path below.
  std::unique_ptr<void, implFreePtrDeletor> del(tempHostPtr);

  err = locking_async_memcpy(CopyDirection::D2H, signal, tempHostPtr,
                             deviceAgent, deviceSrc, tempHostPtr, size);
  // Propagate the specific failure code, matching impl_memcpy_h2d, instead of
  // collapsing every failure to HSA_STATUS_ERROR.
  if (err != HSA_STATUS_SUCCESS)
    return err;

  memcpy(hostDest, tempHostPtr, size);
  return HSA_STATUS_SUCCESS;
}