| //===--- acxxel.h - The Acxxel API ------------------------------*- C++ -*-===// |
| // |
| // The LLVM Compiler Infrastructure |
| // |
| // This file is distributed under the University of Illinois Open Source |
| // License. See LICENSE.TXT for details. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| /// \mainpage Welcome to Acxxel |
| /// |
| /// \section Introduction |
| /// |
| /// \b Acxxel is a library providing a modern C++ interface for managing |
| /// accelerator devices such as GPUs. Acxxel handles operations such as |
| /// allocating device memory, copying data to and from device memory, creating |
| /// and managing device events, and creating and managing device streams. |
| /// |
| /// \subsection ExampleUsage Example Usage |
| /// |
| /// Below is some example code to show you the basics of Acxxel. |
| /// |
| /// \snippet examples/simple_example.cu Example simple saxpy |
| /// |
| /// The above code could be compiled with either `clang` or `nvcc`. Compare this |
| /// with the standard CUDA runtime library code to perform these same |
| /// operations: |
| /// |
| /// \snippet examples/simple_example.cu Example CUDA simple saxpy |
| /// |
| /// Notice that the CUDA runtime calls are not type safe. For example, if you |
| /// change the type of the inputs from `float` to `double`, you have to remember |
| /// to change the size calculation. If you forget, you will get garbage output |
| /// data. In the Acxxel example, you would instead get a helpful compile-time |
| /// error that wouldn't let you forget to change the types inside the function. |
| /// |
| /// The Acxxel example also automatically uses the right sizes for memory |
| /// copies, so you don't have to worry about computing the sizes yourself. |
| /// |
| /// The CUDA runtime interface makes it easy to get the source and destination |
| /// mixed up in a call to `cudaMemcpy`. If you pass the pointers in the wrong |
| /// order or pass the wrong enum value for the direction parameter, you won't |
| /// find out until runtime (if you remembered to check the error return value of |
| /// `cudaMemcpy`). In Acxxel there is no verbose direction enum because the name |
| /// of the function says which way the copy goes, and mixing up the order of |
| /// source and destination is a compile-time error. |
| /// |
| /// The CUDA runtime interface makes you clean up your device memory by calling |
| /// `cudaFree` for each call to `cudaMalloc`. In Acxxel, you don't have to worry |
| /// about that because the memory cleans itself up when it goes out of scope. |
| /// |
| /// \subsection AcxxelFeatures Acxxel Features |
| /// |
| /// Acxxel provides many nice features compared to the C-like interfaces, such |
| /// as the CUDA runtime API, which are normally used for the host code in |
| /// applications using accelerators. |
| /// |
| /// \subsubsection TypeSafety Type safety |
| /// |
| /// Most errors involving mixing up types, sources and destinations, or host and |
| /// device memory result in helpful compile-time errors. |
| /// |
| /// \subsubsection NoCopySizes No need to specify sizes for memory copies |
| /// |
| /// When the arguments to copy functions such as acxxel::Platform::copyHToD know |
/// their sizes (e.g. std::array, std::vector, and C-style arrays), there is no
| /// need to specify the amount of memory to copy; Acxxel will just copy the |
| /// whole thing. Of course the copy functions also have overloads that accept an |
| /// element count for those times when you don't want to copy everything. |
| /// |
| /// \subsubsection MemoryCleanup Automatic memory cleanup |
| /// |
| /// Device memory allocated with acxxel::Platform::mallocD is automatically |
| /// freed when it goes out of scope. |
| /// |
| /// \subsubsection NiceErrorHandling Error handling |
| /// |
/// Operations that would normally return values return acxxel::Expected objects
| /// in Acxxel. These `Expected` objects contain either a value or an error |
| /// message explaining why the value is not present. This reminds the user to |
/// check for errors, but also allows them to opt out easily by calling the
| /// acxxel::Expected::getValue or acxxel::Expected::takeValue methods. The |
| /// `getValue` method returns a reference to the value, leaving the `Expected` |
| /// instance as the value owner, whereas the `takeValue` method moves the value |
| /// out of the `Expected` object and transfers ownership to the caller. |
| /// |
| /// \subsubsection PlatformIndependence Platform independence |
| /// |
| /// Acxxel code works not only with CUDA, but also with any other platform that |
| /// can support its interface. For example, Acxxel supports OpenCL. The |
| /// acxxel::getCUDAPlatform and acxxel::getOpenCLPlatform functions are provided |
| /// to allow easy access to the built-in CUDA and OpenCL platforms. Other |
| /// platforms can be created by implementing the acxxel::Platform interface, and |
| /// instances of those classes can be created directly. |
| /// |
| /// \subsubsection CUDAInterop Seamless interoperation with CUDA |
| /// |
| /// Acxxel functions as a modern replacement for the standard CUDA runtime |
| /// library and interoperates seamlessly with kernel calls. |
| |
| #ifndef ACXXEL_ACXXEL_H |
| #define ACXXEL_ACXXEL_H |
| |
| #include "span.h" |
| #include "status.h" |
| |
| #include <functional> |
| #include <memory> |
| #include <string> |
| #include <type_traits> |
| |
| #if defined(__clang__) || defined(__GNUC__) |
| #define ACXXEL_WARN_UNUSED_RESULT __attribute__((warn_unused_result)) |
| #else |
| #define ACXXEL_WARN_UNUSED_RESULT |
| #endif |
| |
| /// This type is declared here to provide smooth interoperability with the CUDA |
| /// triple-chevron kernel launch syntax. |
| /// |
/// An acxxel::Stream instance will be implicitly convertible to a CUstream_st*,
/// which is the type expected for the stream argument in the triple-chevron
/// CUDA kernel launch. This means that an acxxel::Stream can be passed without
/// explicit casting as the fourth argument to a triple-chevron CUDA kernel
/// launch.
| struct CUstream_st; // NOLINT |
| |
| namespace acxxel { |
| |
| class Event; |
| class Platform; |
| class Stream; |
| |
| template <typename T> class DeviceMemory; |
| |
| template <typename T> class DeviceMemorySpan; |
| |
| template <typename T> class AsyncHostMemory; |
| |
| template <typename T> class AsyncHostMemorySpan; |
| |
| template <typename T> class OwnedAsyncHostMemory; |
| |
| /// Function type used to destroy opaque handles given out by the platform. |
| using HandleDestructor = void (*)(void *); |
| |
| /// Functor type for enqueuing host callbacks on a stream. |
| using StreamCallback = std::function<void(Stream &, const Status &)>; |
| |
/// The grid and block dimensions used to launch a kernel.
///
/// Every dimension defaults to 1, so a default-constructed instance
/// describes a minimal single-block, single-thread launch.
struct KernelLaunchDimensions {
  // Intentionally implicit so callers can rely on the defaults without
  // spelling out the type at the call site.
  KernelLaunchDimensions(unsigned int BlockX = 1, unsigned int BlockY = 1,
                         unsigned int BlockZ = 1, unsigned int GridX = 1,
                         unsigned int GridY = 1, unsigned int GridZ = 1) {
    this->BlockX = BlockX;
    this->BlockY = BlockY;
    this->BlockZ = BlockZ;
    this->GridX = GridX;
    this->GridY = GridY;
    this->GridZ = GridZ;
  }

  // Threads per block in each dimension.
  unsigned int BlockX;
  unsigned int BlockY;
  unsigned int BlockZ;
  // Blocks per grid in each dimension.
  unsigned int GridX;
  unsigned int GridY;
  unsigned int GridZ;
};
| |
/// Logs a warning message.
void logWarning(const std::string &Message);

/// Gets a pointer to the standard CUDA platform.
///
/// Returns an Expected holding either the Platform pointer or an error
/// explaining why the CUDA platform is not available.
Expected<Platform *> getCUDAPlatform();

/// Gets a pointer to the standard OpenCL platform.
///
/// Returns an Expected holding either the Platform pointer or an error
/// explaining why the OpenCL platform is not available.
Expected<Platform *> getOpenCLPlatform();
| |
| /// A function that can be executed on the device. |
| /// |
| /// A Kernel is created from a Program by calling Program::createKernel, and a |
| /// kernel is enqueued into a Stream by calling Stream::asyncKernelLaunch. |
/// A function that can be executed on the device.
///
/// A Kernel is created from a Program by calling Program::createKernel, and a
/// kernel is enqueued into a Stream by calling Stream::asyncKernelLaunch.
///
/// Instances are move-only: each Kernel owns an opaque platform handle that is
/// released by the platform-supplied HandleDestructor on destruction.
class Kernel {
public:
  Kernel(const Kernel &) = delete;
  Kernel &operator=(const Kernel &) = delete;
  Kernel(Kernel &&) noexcept;
  Kernel &operator=(Kernel &&That) noexcept;
  ~Kernel() = default;

private:
  // Only a Program can make a kernel.
  friend class Program;
  Kernel(Platform *APlatform, void *AHandle, HandleDestructor Destructor)
      : ThePlatform(APlatform), TheHandle(AHandle, Destructor) {}

  // Let stream get raw handle for kernel launches.
  friend class Stream;

  // The platform that created this kernel (non-owning pointer).
  Platform *ThePlatform;

  // Opaque platform-specific kernel handle, freed by the destructor supplied
  // at construction time.
  std::unique_ptr<void, HandleDestructor> TheHandle;
};
| |
| /// A program loaded on a device. |
| /// |
| /// A program can be created by calling Platform::createProgramFromSource, and a |
| /// Kernel can be created from a program by running Program::createKernel. |
| /// |
| /// A program can contain any number of kernels, and a program only needs to be |
| /// loaded once in order to use all its kernels. |
/// A program loaded on a device.
///
/// A program can be created by calling Platform::createProgramFromSource, and a
/// Kernel can be created from a program by running Program::createKernel.
///
/// A program can contain any number of kernels, and a program only needs to be
/// loaded once in order to use all its kernels.
///
/// Instances are move-only: each Program owns an opaque platform handle that
/// is released by the platform-supplied HandleDestructor on destruction.
class Program {
public:
  Program(const Program &) = delete;
  Program &operator=(const Program &) = delete;
  Program(Program &&) noexcept;
  Program &operator=(Program &&That) noexcept;
  ~Program() = default;

  /// Creates a Kernel for the entry point with the given name, or returns an
  /// error if no such kernel can be created.
  Expected<Kernel> createKernel(const std::string &Name);

private:
  // Only a platform can make a program.
  friend class Platform;
  Program(Platform *APlatform, void *AHandle, HandleDestructor Destructor)
      : ThePlatform(APlatform), TheHandle(AHandle, Destructor) {}

  // The platform that created this program (non-owning pointer).
  Platform *ThePlatform;

  // Opaque platform-specific program handle, freed by the destructor supplied
  // at construction time.
  std::unique_ptr<void, HandleDestructor> TheHandle;
};
| |
| /// A stream of computation. |
| /// |
| /// All operations enqueued on a Stream are serialized, but operations enqueued |
| /// on different Streams may run concurrently. |
| /// |
| /// Each Stream is associated with a specific, fixed device. |
/// A stream of computation.
///
/// All operations enqueued on a Stream are serialized, but operations enqueued
/// on different Streams may run concurrently.
///
/// Each Stream is associated with a specific, fixed device.
///
/// Errors from enqueued operations are recorded in the Stream's sticky status
/// (only the first error is kept; see setStatus) and are reported and cleared
/// by sync() or takeStatus().
class Stream {
public:
  Stream(const Stream &) = delete;
  Stream &operator=(const Stream &) = delete;
  Stream(Stream &&) noexcept;
  Stream &operator=(Stream &&) noexcept;
  ~Stream() = default;

  /// Gets the index of the device on which this Stream operates.
  int getDeviceIndex() { return TheDeviceIndex; }

  /// Blocks the host until the Stream is done executing all previously enqueued
  /// work.
  ///
  /// Returns a Status for any errors emitted by the asynchronous work on the
  /// Stream, or by any error in the synchronization process itself. Clears the
  /// Status state of the stream.
  Status sync() ACXXEL_WARN_UNUSED_RESULT;

  /// Makes all future work submitted to this stream wait until the event
  /// reports completion.
  ///
  /// This is useful because the event argument may be recorded on a different
  /// stream, so this method allows for synchronization between streams without
  /// synchronizing all streams.
  ///
  /// Returns a Status for any errors emitted by the asynchronous work on the
  /// Stream, or by any error in the synchronization process itself. Clears the
  /// Status state of the stream.
  Status waitOnEvent(Event &Event) ACXXEL_WARN_UNUSED_RESULT;

  /// Adds a host callback function to the stream.
  ///
  /// The callback will be called on the host after all previously enqueued work
  /// on the stream is complete, and no work enqueued after the callback will
  /// begin until after the callback has finished.
  Stream &addCallback(std::function<void(Stream &, const Status &)> Callback);

  /// \name Asynchronous device memory copies.
  ///
  /// These functions enqueue asynchronous memory copy operations into the
  /// stream. Only async host memory is allowed for host arguments to these
  /// functions. Async host memory can be created from normal host memory by
  /// registering it with Platform::registerHostMem. AsyncHostMemory can also be
  /// allocated directly by calling Platform::newAsyncHostMem.
  ///
  /// For all these functions, DeviceSrcTy must be convertible to
  /// DeviceMemorySpan<const T>, DeviceDstTy must be convertible to
  /// DeviceMemorySpan<T>, HostSrcTy must be convertible to
  /// AsyncHostMemorySpan<const T> and HostDstTy must be convertible to
  /// AsyncHostMemorySpan<T>. Additionally, the T types must match for the
  /// destination and source.
  /// \{

  /// Copies from device memory to device memory.
  template <typename DeviceSrcTy, typename DeviceDstTy>
  Stream &asyncCopyDToD(DeviceSrcTy &&DeviceSrc, DeviceDstTy &&DeviceDst);

  /// Copies from device memory to device memory with a given element count.
  template <typename DeviceSrcTy, typename DeviceDstTy>
  Stream &asyncCopyDToD(DeviceSrcTy &&DeviceSrc, DeviceDstTy &&DeviceDst,
                        ptrdiff_t ElementCount);

  /// Copies from device memory to host memory.
  template <typename DeviceSrcTy, typename HostDstTy>
  Stream &asyncCopyDToH(DeviceSrcTy &&DeviceSrc, HostDstTy &&HostDst);

  /// Copies from device memory to host memory with a given element count.
  template <typename DeviceSrcTy, typename HostDstTy>
  Stream &asyncCopyDToH(DeviceSrcTy &&DeviceSrc, HostDstTy &&HostDst,
                        ptrdiff_t ElementCount);

  /// Copies from host memory to device memory.
  template <typename HostSrcTy, typename DeviceDstTy>
  Stream &asyncCopyHToD(HostSrcTy &&HostSrc, DeviceDstTy &&DeviceDst);

  /// Copies from host memory to device memory with a given element count.
  ///
  /// NOTE(review): DeviceDst is taken by lvalue reference here, unlike the
  /// rvalue-reference parameters of the sibling overloads — possibly a typo
  /// for `&&`; confirm before changing, as the out-of-class definition must
  /// match.
  template <typename HostSrcTy, typename DeviceDstTy>
  Stream &asyncCopyHToD(HostSrcTy &&HostSrc, DeviceDstTy &DeviceDst,
                        ptrdiff_t ElementCount);

  /// \}

  /// \name Stream-synchronous device memory copies
  ///
  /// These functions block the host until the copy and all previously-enqueued
  /// work on the stream has completed.
  ///
  /// For all these functions, DeviceSrcTy must be convertible to
  /// DeviceMemorySpan<const T>, DeviceDstTy must be convertible to
  /// DeviceMemorySpan<T>, HostSrcTy must be convertible to Span<const T> and
  /// HostDstTy must be convertible to Span<T>. Additionally, the T types must
  /// match for the destination and source.
  /// \{

  template <typename DeviceSrcTy, typename DeviceDstTy>
  Stream &syncCopyDToD(DeviceSrcTy &&DeviceSrc, DeviceDstTy &&DeviceDst);

  template <typename DeviceSrcTy, typename DeviceDstTy>
  Stream &syncCopyDToD(DeviceSrcTy &&DeviceSrc, DeviceDstTy &&DeviceDst,
                       ptrdiff_t ElementCount);

  template <typename DeviceSrcTy, typename HostDstTy>
  Stream &syncCopyDToH(DeviceSrcTy &&DeviceSrc, HostDstTy &&HostDst);

  template <typename DeviceSrcTy, typename HostDstTy>
  Stream &syncCopyDToH(DeviceSrcTy &&DeviceSrc, HostDstTy &&HostDst,
                       ptrdiff_t ElementCount);

  template <typename HostSrcTy, typename DeviceDstTy>
  Stream &syncCopyHToD(HostSrcTy &&HostSrc, DeviceDstTy &&DeviceDst);

  template <typename HostSrcTy, typename DeviceDstTy>
  Stream &syncCopyHToD(HostSrcTy &&HostSrc, DeviceDstTy &DeviceDst,
                       ptrdiff_t ElementCount);

  /// \}

  /// Enqueues an operation in the stream to set the bytes of a given device
  /// memory region to a given value.
  ///
  /// DeviceDstTy must be convertible to DeviceMemorySpan<T> for non-const T.
  template <typename DeviceDstTy>
  Stream &asyncMemsetD(DeviceDstTy &&DeviceDst, char ByteValue);

  /// Enqueues a kernel launch operation on this stream.
  Stream &asyncKernelLaunch(const Kernel &TheKernel,
                            KernelLaunchDimensions LaunchDimensions,
                            Span<void *> Arguments, Span<size_t> ArgumentSizes,
                            size_t SharedMemoryBytes = 0);

  /// Enqueues an event in the stream.
  Stream &enqueueEvent(Event &E);

  // Allows implicit conversion to (CUstream_st *). This makes triple-chevron
  // kernel calls look nicer because you can just pass a acxxel::Stream
  // directly.
  operator CUstream_st *() {
    return static_cast<CUstream_st *>(TheHandle.get());
  }

  /// Gets the current status for the Stream and clears the Stream's status.
  Status takeStatus() ACXXEL_WARN_UNUSED_RESULT {
    Status OldStatus = TheStatus;
    TheStatus = Status();
    return OldStatus;
  }

private:
  // Only a platform can make a stream.
  friend class Platform;
  Stream(Platform *APlatform, int DeviceIndex, void *AHandle,
         HandleDestructor Destructor)
      : ThePlatform(APlatform), TheDeviceIndex(DeviceIndex),
        TheHandle(AHandle, Destructor) {}

  // Records S in the stream's sticky status. Only the first error is kept;
  // subsequent errors do not overwrite the original failure. Returns S
  // unchanged so calls can be chained.
  const Status &setStatus(const Status &S) {
    if (S.isError() && !TheStatus.isError()) {
      TheStatus = S;
    }
    return S;
  }

  // Returns (and clears) the stream's stored error status if one is present;
  // otherwise returns S.
  Status takeStatusOr(const Status &S) {
    if (TheStatus.isError()) {
      Status OldStatus = TheStatus;
      TheStatus = Status();
      return OldStatus;
    }
    return S;
  }

  // The platform that created the stream.
  Platform *ThePlatform;

  // The index of the device on which the stream operates.
  int TheDeviceIndex;

  // A handle to the platform-specific handle implementation.
  std::unique_ptr<void, HandleDestructor> TheHandle;

  // Sticky status: the first error recorded via setStatus, cleared by
  // takeStatus/takeStatusOr.
  Status TheStatus;
};
| |
| /// A user-created event on a device. |
| /// |
| /// This is useful for setting synchronization points in a Stream. The host can |
| /// synchronize with a Stream without using events, but that requires all the |
| /// work in the Stream to be finished in order for the host to be notified. |
| /// Events provide more flexibility by allowing the host to be notified when a |
| /// single Event in the Stream is finished, rather than all the work in the |
| /// Stream. |
/// A user-created event on a device.
///
/// This is useful for setting synchronization points in a Stream. The host can
/// synchronize with a Stream without using events, but that requires all the
/// work in the Stream to be finished in order for the host to be notified.
/// Events provide more flexibility by allowing the host to be notified when a
/// single Event in the Stream is finished, rather than all the work in the
/// Stream.
///
/// Instances are move-only: each Event owns an opaque platform handle that is
/// released by the platform-supplied HandleDestructor on destruction.
class Event {
public:
  Event(const Event &) = delete;
  Event &operator=(const Event &) = delete;
  Event(Event &&) noexcept;
  Event &operator=(Event &&That) noexcept;
  ~Event() = default;

  /// Checks to see if the event is done running.
  bool isDone();

  /// Blocks the host until the event is done.
  ///
  /// Returns a Status describing any error encountered while waiting.
  Status sync();

  /// Gets the time elapsed between the previous event's execution and this
  /// event's execution.
  ///
  /// Returns the elapsed time in seconds, or an error if it could not be
  /// measured.
  Expected<float> getSecondsSince(const Event &Previous);

private:
  // Only a platform can make an event.
  friend class Platform;
  Event(Platform *APlatform, int DeviceIndex, void *AHandle,
        HandleDestructor Destructor)
      : ThePlatform(APlatform), TheDeviceIndex(DeviceIndex),
        TheHandle(AHandle, Destructor) {}

  // The platform that created this event (non-owning pointer).
  Platform *ThePlatform;

  // The index of the device on which the event can be enqueued.
  int TheDeviceIndex;

  // Opaque platform-specific event handle, freed by the destructor supplied
  // at construction time.
  std::unique_ptr<void, HandleDestructor> TheHandle;
};
| |
| /// An accelerator platform. |
| /// |
| /// This is the base class for all platforms such as CUDA and OpenCL. It |
| /// contains many virtual methods that must be overridden by each platform |
| /// implementation. |
| /// |
| /// It also has some template wrapper functions that take care of type checking |
| /// and then forward their arguments on to raw virtual functions that are |
| /// implemented by each specific platform. |
| class Platform { |
| public: |
| virtual ~Platform(){}; |
| |
| /// Gets the number of devices for this platform in this system. |
| virtual Expected<int> getDeviceCount() = 0; |
| |
| /// Creates a stream on the given device for the platform. |
| virtual Expected<Stream> createStream(int DeviceIndex = 0) = 0; |
| |
| /// Creates an event on the given device for the platform. |
| virtual Expected<Event> createEvent(int DeviceIndex = 0) = 0; |
| |
| /// Allocates owned device memory. |
| /// |
| /// \warning This function only allocates space in device memory, it does not |
| /// call the constructor of T. |
| template <typename T> |
| Expected<DeviceMemory<T>> mallocD(ptrdiff_t ElementCount, |
| int DeviceIndex = 0) { |
| Expected<void *> MaybePointer = |
| rawMallocD(ElementCount * sizeof(T), DeviceIndex); |
| if (MaybePointer.isError()) |
| return MaybePointer.getError(); |
| return DeviceMemory<T>(this, MaybePointer.getValue(), ElementCount, |
| this->getDeviceMemoryHandleDestructor()); |
| } |
| |
| /// Creates a DeviceMemorySpan for a device symbol. |
| /// |
| /// This function is present to support __device__ variables in CUDA. Given a |
| /// pointer to a __device__ variable, this function returns a DeviceMemorySpan |
| /// referencing the device memory that stores that __device__ variable. |
| template <typename ElementType> |
| Expected<DeviceMemorySpan<ElementType>> getSymbolMemory(ElementType *Symbol, |
| int DeviceIndex = 0) { |
| Expected<void *> MaybeAddress = |
| rawGetDeviceSymbolAddress(Symbol, DeviceIndex); |
| if (MaybeAddress.isError()) |
| return MaybeAddress.getError(); |
| ElementType *Address = static_cast<ElementType *>(MaybeAddress.getValue()); |
| Expected<ptrdiff_t> MaybeSize = rawGetDeviceSymbolSize(Symbol, DeviceIndex); |
| if (MaybeSize.isError()) |
| return MaybeSize.getError(); |
| ptrdiff_t Size = MaybeSize.getValue(); |
| return DeviceMemorySpan<ElementType>(this, Address, |
| Size / sizeof(ElementType), 0); |
| } |
| |
| /// \name Host memory registration functions. |
| /// \{ |
| |
| template <typename T> |
| Expected<AsyncHostMemory<const T>> registerHostMem(Span<const T> Memory) { |
| Status S = rawRegisterHostMem(Memory.data(), Memory.size() * sizeof(T)); |
| if (S.isError()) |
| return S; |
| return AsyncHostMemory<const T>( |
| Memory.data(), Memory.size(), |
| this->getUnregisterHostMemoryHandleDestructor()); |
| } |
| |
| template <typename T> |
| Expected<AsyncHostMemory<T>> registerHostMem(Span<T> Memory) { |
| Status S = rawRegisterHostMem(Memory.data(), Memory.size() * sizeof(T)); |
| if (S.isError()) |
| return S; |
| return AsyncHostMemory<T>(Memory.data(), Memory.size(), |
| this->getUnregisterHostMemoryHandleDestructor()); |
| } |
| |
| template <typename T, size_t N> |
| Expected<AsyncHostMemory<T>> registerHostMem(T (&Array)[N]) { |
| Span<T> Span(Array); |
| Status S = rawRegisterHostMem(Span.data(), Span.size() * sizeof(T)); |
| if (S.isError()) |
| return S; |
| return AsyncHostMemory<T>(Span.data(), Span.size(), |
| this->getUnregisterHostMemoryHandleDestructor()); |
| } |
| |
| /// Registers memory stored in a container with a data() member function and |
| /// which can be converted to a Span<T*>. |
| template <typename Container> |
| auto registerHostMem(Container &Cont) -> Expected<AsyncHostMemory< |
| typename std::remove_reference<decltype(*Cont.data())>::type>> { |
| using ValueType = |
| typename std::remove_reference<decltype(*Cont.data())>::type; |
| Span<ValueType> Span(Cont); |
| Status S = rawRegisterHostMem(Span.data(), Span.size() * sizeof(ValueType)); |
| if (S.isError()) |
| return S; |
| return AsyncHostMemory<ValueType>( |
| Span.data(), Span.size(), |
| this->getUnregisterHostMemoryHandleDestructor()); |
| } |
| |
| /// Allocates an owned, registered array of objects on the host. |
| /// |
| /// Default constructs each element in the resulting array. |
| template <typename T> |
| Expected<OwnedAsyncHostMemory<T>> newAsyncHostMem(ptrdiff_t ElementCount) { |
| Expected<void *> MaybeMemory = |
| rawMallocRegisteredH(ElementCount * sizeof(T)); |
| if (MaybeMemory.isError()) |
| return MaybeMemory.getError(); |
| T *Memory = static_cast<T *>(MaybeMemory.getValue()); |
| for (ptrdiff_t I = 0; I < ElementCount; ++I) |
| new (Memory + I) T; |
| return OwnedAsyncHostMemory<T>(Memory, ElementCount, |
| this->getFreeHostMemoryHandleDestructor()); |
| } |
| |
| /// \} |
| |
| virtual Expected<Program> createProgramFromSource(Span<const char> Source, |
| int DeviceIndex = 0) = 0; |
| |
| protected: |
| friend class Stream; |
| friend class Event; |
| friend class Program; |
| template <typename T> friend class DeviceMemorySpan; |
| |
| void *getStreamHandle(Stream &Stream) { return Stream.TheHandle.get(); } |
| void *getEventHandle(Event &Event) { return Event.TheHandle.get(); } |
| |
| // Pass along access to Stream constructor to subclasses. |
| Stream constructStream(Platform *APlatform, int DeviceIndex, void *AHandle, |
| HandleDestructor Destructor) { |
| return Stream(APlatform, DeviceIndex, AHandle, Destructor); |
| } |
| |
| // Pass along access to Event constructor to subclasses. |
| Event constructEvent(Platform *APlatform, int DeviceIndex, void *AHandle, |
| HandleDestructor Destructor) { |
| return Event(APlatform, DeviceIndex, AHandle, Destructor); |
| } |
| |
| // Pass along access to Program constructor to subclasses. |
| Program constructProgram(Platform *APlatform, void *AHandle, |
| HandleDestructor Destructor) { |
| return Program(APlatform, AHandle, Destructor); |
| } |
| |
| virtual Status streamSync(void *Stream) = 0; |
| virtual Status streamWaitOnEvent(void *Stream, void *Event) = 0; |
| |
| virtual Status enqueueEvent(void *Event, void *Stream) = 0; |
| virtual bool eventIsDone(void *Event) = 0; |
| virtual Status eventSync(void *Event) = 0; |
| virtual Expected<float> getSecondsBetweenEvents(void *StartEvent, |
| void *EndEvent) = 0; |
| |
| virtual Expected<void *> rawMallocD(ptrdiff_t ByteCount, int DeviceIndex) = 0; |
| virtual HandleDestructor getDeviceMemoryHandleDestructor() = 0; |
| virtual void *getDeviceMemorySpanHandle(void *BaseHandle, size_t ByteSize, |
| size_t ByteOffset) = 0; |
| virtual void rawDestroyDeviceMemorySpanHandle(void *Handle) = 0; |
| |
| virtual Expected<void *> rawGetDeviceSymbolAddress(const void *Symbol, |
| int DeviceIndex) = 0; |
| virtual Expected<ptrdiff_t> rawGetDeviceSymbolSize(const void *Symbol, |
| int DeviceIndex) = 0; |
| |
| virtual Status rawRegisterHostMem(const void *Memory, |
| ptrdiff_t ByteCount) = 0; |
| virtual HandleDestructor getUnregisterHostMemoryHandleDestructor() = 0; |
| |
| virtual Expected<void *> rawMallocRegisteredH(ptrdiff_t ByteCount) = 0; |
| virtual HandleDestructor getFreeHostMemoryHandleDestructor() = 0; |
| |
| virtual Status asyncCopyDToD(const void *DeviceSrc, |
| ptrdiff_t DeviceSrcByteOffset, void *DeviceDst, |
| ptrdiff_t DeviceDstByteOffset, |
| ptrdiff_t ByteCount, void *Stream) = 0; |
| virtual Status asyncCopyDToH(const void *DeviceSrc, |
| ptrdiff_t DeviceSrcByteOffset, void *HostDst, |
| ptrdiff_t ByteCount, void *Stream) = 0; |
| virtual Status asyncCopyHToD(const void *HostSrc, void *DeviceDst, |
| ptrdiff_t DeviceDstByteOffset, |
| ptrdiff_t ByteCount, void *Stream) = 0; |
| |
| virtual Status asyncMemsetD(void *DeviceDst, ptrdiff_t ByteOffset, |
| ptrdiff_t ByteCount, char ByteValue, |
| void *Stream) = 0; |
| |
| virtual Status addStreamCallback(Stream &Stream, StreamCallback Callback) = 0; |
| |
| virtual Expected<void *> rawCreateKernel(void *Program, |
| const std::string &Name) = 0; |
| virtual HandleDestructor getKernelHandleDestructor() = 0; |
| |
| virtual Status rawEnqueueKernelLaunch(void *Stream, void *Kernel, |
| KernelLaunchDimensions LaunchDimensions, |
| Span<void *> Arguments, |
| Span<size_t> ArgumentSizes, |
| size_t SharedMemoryBytes) = 0; |
| }; |
| |
| // Implementation of templated Stream functions. |
| |
| template <typename DeviceSrcTy, typename DeviceDstTy> |
| Stream &Stream::asyncCopyDToD(DeviceSrcTy &&DeviceSrc, |
| DeviceDstTy &&DeviceDst) { |
| using SrcElementTy = |
| typename std::remove_reference<DeviceSrcTy>::type::value_type; |
| using DstElementTy = |
| typename std::remove_reference<DeviceDstTy>::type::value_type; |
| static_assert(std::is_same<SrcElementTy, DstElementTy>::value, |
| "asyncCopyDToD cannot copy between arrays of different types"); |
| DeviceMemorySpan<const SrcElementTy> DeviceSrcSpan(DeviceSrc); |
| DeviceMemorySpan<DstElementTy> DeviceDstSpan(DeviceDst); |
| if (DeviceSrcSpan.size() != DeviceDstSpan.size()) { |
| setStatus(Status("asyncCopyDToD source element count " + |
| std::to_string(DeviceSrcSpan.size()) + |
| " does not equal destination element count " + |
| std::to_string(DeviceDstSpan.size()))); |
| return *this; |
| } |
| setStatus(ThePlatform->asyncCopyDToD( |
| DeviceSrcSpan.baseHandle(), DeviceSrcSpan.byte_offset(), |
| DeviceDstSpan.baseHandle(), DeviceDstSpan.byte_offset(), |
| DeviceSrcSpan.byte_size(), TheHandle.get())); |
| return *this; |
| } |
| |
| template <typename DeviceSrcTy, typename DeviceDstTy> |
| Stream &Stream::asyncCopyDToD(DeviceSrcTy &&DeviceSrc, DeviceDstTy &&DeviceDst, |
| ptrdiff_t ElementCount) { |
| using SrcElementTy = |
| typename std::remove_reference<DeviceSrcTy>::type::value_type; |
| using DstElementTy = |
| typename std::remove_reference<DeviceDstTy>::type::value_type; |
| static_assert(std::is_same<SrcElementTy, DstElementTy>::value, |
| "asyncCopyDToD cannot copy between arrays of different types"); |
| DeviceMemorySpan<const SrcElementTy> DeviceSrcSpan(DeviceSrc); |
| DeviceMemorySpan<DstElementTy> DeviceDstSpan(DeviceDst); |
| if (DeviceSrcSpan.size() < ElementCount) { |
| setStatus(Status("asyncCopyDToD source element count " + |
| std::to_string(DeviceSrcSpan.size()) + |
| " is less than requested element count " + |
| std::to_string(ElementCount))); |
| return *this; |
| } |
| if (DeviceDstSpan.size() < ElementCount) { |
| setStatus(Status("asyncCopyDToD destination element count " + |
| std::to_string(DeviceDst.size()) + |
| " is less than requested element count " + |
| std::to_string(ElementCount))); |
| return *this; |
| } |
| setStatus(ThePlatform->asyncCopyDToD( |
| DeviceSrcSpan.baseHandle(), DeviceSrcSpan.byte_offset(), |
| DeviceDstSpan.baseHandle(), DeviceDstSpan.byte_offset(), |
| ElementCount * sizeof(SrcElementTy), TheHandle.get())); |
| return *this; |
| } |
| |
| template <typename DeviceSrcTy, typename HostDstTy> |
| Stream &Stream::asyncCopyDToH(DeviceSrcTy &&DeviceSrc, HostDstTy &&HostDst) { |
| using SrcElementTy = |
| typename std::remove_reference<DeviceSrcTy>::type::value_type; |
| DeviceMemorySpan<const SrcElementTy> DeviceSrcSpan(DeviceSrc); |
| AsyncHostMemorySpan<SrcElementTy> HostDstSpan(HostDst); |
| if (DeviceSrcSpan.size() != HostDstSpan.size()) { |
| setStatus(Status("asyncCopyDToH source element count " + |
| std::to_string(DeviceSrcSpan.size()) + |
| " does not equal destination element count " + |
| std::to_string(HostDstSpan.size()))); |
| return *this; |
| } |
| setStatus(ThePlatform->asyncCopyDToH( |
| DeviceSrcSpan.baseHandle(), DeviceSrcSpan.byte_offset(), |
| HostDstSpan.data(), DeviceSrcSpan.byte_size(), TheHandle.get())); |
| return *this; |
| } |
| |
| template <typename DeviceSrcTy, typename HostDstTy> |
| Stream &Stream::asyncCopyDToH(DeviceSrcTy &&DeviceSrc, HostDstTy &&HostDst, |
| ptrdiff_t ElementCount) { |
| using SrcElementTy = |
| typename std::remove_reference<DeviceSrcTy>::type::value_type; |
| DeviceMemorySpan<const SrcElementTy> DeviceSrcSpan(DeviceSrc); |
| AsyncHostMemorySpan<SrcElementTy> HostDstSpan(HostDst); |
| if (DeviceSrcSpan.size() < ElementCount) { |
| setStatus(Status("asyncCopyDToH source element count " + |
| std::to_string(DeviceSrcSpan.size()) + |
| " is less than requested element count " + |
| std::to_string(ElementCount))); |
| return *this; |
| } |
| if (HostDstSpan.size() < ElementCount) { |
| setStatus(Status("asyncCopyDToH destination element count " + |
| std::to_string(HostDstSpan.size()) + |
| " is less than requested element count " + |
| std::to_string(ElementCount))); |
| return *this; |
| } |
| setStatus(ThePlatform->asyncCopyDToH( |
| DeviceSrcSpan.baseHandle(), DeviceSrcSpan.byte_offset(), |
| HostDstSpan.data(), ElementCount * sizeof(SrcElementTy), |
| TheHandle.get())); |
| return *this; |
| } |
| |
| template <typename HostSrcTy, typename DeviceDstTy> |
| Stream &Stream::asyncCopyHToD(HostSrcTy &&HostSrc, DeviceDstTy &&DeviceDst) { |
| using DstElementTy = |
| typename std::remove_reference<DeviceDstTy>::type::value_type; |
| AsyncHostMemorySpan<const DstElementTy> HostSrcSpan(HostSrc); |
| DeviceMemorySpan<DstElementTy> DeviceDstSpan(DeviceDst); |
| if (HostSrcSpan.size() != DeviceDstSpan.size()) { |
| setStatus(Status("asyncCopyHToD source element count " + |
| std::to_string(HostSrcSpan.size()) + |
| " does not equal destination element count " + |
| std::to_string(DeviceDstSpan.size()))); |
| return *this; |
| } |
| setStatus(ThePlatform->asyncCopyHToD( |
| HostSrcSpan.data(), DeviceDstSpan.baseHandle(), |
| DeviceDstSpan.byte_offset(), HostSrcSpan.byte_size(), TheHandle.get())); |
| return *this; |
| } |
| |
| template <typename HostSrcTy, typename DeviceDstTy> |
| Stream &Stream::asyncCopyHToD(HostSrcTy &&HostSrc, DeviceDstTy &DeviceDst, |
| ptrdiff_t ElementCount) { |
| using DstElementTy = |
| typename std::remove_reference<DeviceDstTy>::type::value_type; |
| AsyncHostMemorySpan<const DstElementTy> HostSrcSpan(HostSrc); |
| DeviceMemorySpan<DstElementTy> DeviceDstSpan(DeviceDst); |
| if (HostSrcSpan.size() < ElementCount) { |
| setStatus(Status("copyHToD source element count " + |
| std::to_string(HostSrcSpan.size()) + |
| " is less than requested element count " + |
| std::to_string(ElementCount))); |
| return *this; |
| } |
| if (DeviceDstSpan.size() < ElementCount) { |
| setStatus(Status("copyHToD destination element count " + |
| std::to_string(DeviceDstSpan.size()) + |
| " is less than requested element count " + |
| std::to_string(ElementCount))); |
| return *this; |
| } |
| setStatus(ThePlatform->asyncCopyHToD( |
| HostSrcSpan.data(), DeviceDstSpan.baseHandle(), |
| DeviceDstSpan.byte_offset(), ElementCount * sizeof(DstElementTy), |
| TheHandle.get())); |
| return *this; |
| } |
| |
| template <typename DeviceDstTy> |
| Stream &Stream::asyncMemsetD(DeviceDstTy &&DeviceDst, char ByteValue) { |
| using DstElementTy = |
| typename std::remove_reference<DeviceDstTy>::type::value_type; |
| DeviceMemorySpan<DstElementTy> DeviceDstSpan(DeviceDst); |
| setStatus(ThePlatform->asyncMemsetD( |
| DeviceDstSpan.baseHandle(), DeviceDstSpan.byte_offset(), |
| DeviceDstSpan.byte_size(), ByteValue, TheHandle.get())); |
| return *this; |
| } |
| |
| template <typename DeviceSrcTy, typename DeviceDstTy> |
| Stream &Stream::syncCopyDToD(DeviceSrcTy &&DeviceSrc, DeviceDstTy &&DeviceDst) { |
| using SrcElementTy = |
| typename std::remove_reference<DeviceSrcTy>::type::value_type; |
| using DstElementTy = |
| typename std::remove_reference<DeviceDstTy>::type::value_type; |
| static_assert(std::is_same<SrcElementTy, DstElementTy>::value, |
| "copyDToD cannot copy between arrays of different types"); |
| DeviceMemorySpan<const SrcElementTy> DeviceSrcSpan(DeviceSrc); |
| DeviceMemorySpan<DstElementTy> DeviceDstSpan(DeviceDst); |
| if (DeviceSrcSpan.size() != DeviceDstSpan.size()) { |
| setStatus(Status("copyDToD source element count " + |
| std::to_string(DeviceSrcSpan.size()) + |
| " does not equal destination element count " + |
| std::to_string(DeviceDstSpan.size()))); |
| return *this; |
| } |
| if (setStatus(ThePlatform->asyncCopyDToD( |
| DeviceSrcSpan.baseHandle(), DeviceSrcSpan.byte_offset(), |
| DeviceDstSpan.baseHandle(), DeviceDstSpan.byte_offset(), |
| DeviceSrcSpan.byte_size(), TheHandle.get())) |
| .isError()) { |
| return *this; |
| } |
| setStatus(sync()); |
| return *this; |
| } |
| |
| template <typename DeviceSrcTy, typename DeviceDstTy> |
| Stream &Stream::syncCopyDToD(DeviceSrcTy &&DeviceSrc, DeviceDstTy &&DeviceDst, |
| ptrdiff_t ElementCount) { |
| using SrcElementTy = |
| typename std::remove_reference<DeviceSrcTy>::type::value_type; |
| using DstElementTy = |
| typename std::remove_reference<DeviceDstTy>::type::value_type; |
| static_assert(std::is_same<SrcElementTy, DstElementTy>::value, |
| "copyDToD cannot copy between arrays of different types"); |
| DeviceMemorySpan<const SrcElementTy> DeviceSrcSpan(DeviceSrc); |
| DeviceMemorySpan<DstElementTy> DeviceDstSpan(DeviceDst); |
| if (DeviceSrcSpan.size() < ElementCount) { |
| setStatus(Status("copyDToD source element count " + |
| std::to_string(DeviceSrcSpan.size()) + |
| " is less than requested element count " + |
| std::to_string(ElementCount))); |
| return *this; |
| } |
| if (DeviceDstSpan.size() < ElementCount) { |
| setStatus(Status("copyDToD destination element count " + |
| std::to_string(DeviceDst.size()) + |
| " is less than requested element count " + |
| std::to_string(ElementCount))); |
| return *this; |
| } |
| if (setStatus(ThePlatform->asyncCopyDToD( |
| DeviceSrcSpan.baseHandle(), DeviceSrcSpan.byte_offset(), |
| DeviceDstSpan.baseHandle(), DeviceDstSpan.byte_offset(), |
| ElementCount * sizeof(SrcElementTy), TheHandle.get())) |
| .isError()) { |
| return *this; |
| } |
| setStatus(sync()); |
| return *this; |
| } |
| |
| template <typename DeviceSrcTy, typename HostDstTy> |
| Stream &Stream::syncCopyDToH(DeviceSrcTy &&DeviceSrc, HostDstTy &&HostDst) { |
| using SrcElementTy = |
| typename std::remove_reference<DeviceSrcTy>::type::value_type; |
| DeviceMemorySpan<const SrcElementTy> DeviceSrcSpan(DeviceSrc); |
| Span<SrcElementTy> HostDstSpan(HostDst); |
| if (DeviceSrcSpan.size() != HostDstSpan.size()) { |
| setStatus(Status("copyDToH source element count " + |
| std::to_string(DeviceSrcSpan.size()) + |
| " does not equal destination element count " + |
| std::to_string(HostDstSpan.size()))); |
| return *this; |
| } |
| if (setStatus(ThePlatform->asyncCopyDToH( |
| DeviceSrcSpan.baseHandle(), DeviceSrcSpan.byte_offset(), |
| HostDstSpan.data(), DeviceSrcSpan.byte_size(), |
| TheHandle.get())) |
| .isError()) { |
| return *this; |
| } |
| setStatus(sync()); |
| return *this; |
| } |
| |
| template <typename DeviceSrcTy, typename HostDstTy> |
| Stream &Stream::syncCopyDToH(DeviceSrcTy &&DeviceSrc, HostDstTy &&HostDst, |
| ptrdiff_t ElementCount) { |
| using SrcElementTy = |
| typename std::remove_reference<DeviceSrcTy>::type::value_type; |
| DeviceMemorySpan<const SrcElementTy> DeviceSrcSpan(DeviceSrc); |
| Span<SrcElementTy> HostDstSpan(HostDst); |
| if (DeviceSrcSpan.size() < ElementCount) { |
| setStatus(Status("copyDToH source element count " + |
| std::to_string(DeviceSrcSpan.size()) + |
| " is less than requested element count " + |
| std::to_string(ElementCount))); |
| return *this; |
| } |
| if (HostDstSpan.size() < ElementCount) { |
| setStatus(Status("copyDToH destination element count " + |
| std::to_string(HostDstSpan.size()) + |
| " is less than requested element count " + |
| std::to_string(ElementCount))); |
| return *this; |
| } |
| if (setStatus(ThePlatform->asyncCopyDToH( |
| DeviceSrcSpan.baseHandle(), DeviceSrcSpan.byte_offset(), |
| HostDstSpan.data(), ElementCount * sizeof(SrcElementTy), |
| TheHandle.get())) |
| .isError()) { |
| return *this; |
| } |
| setStatus(sync()); |
| return *this; |
| } |
| |
| template <typename HostSrcTy, typename DeviceDstTy> |
| Stream &Stream::syncCopyHToD(HostSrcTy &&HostSrc, DeviceDstTy &&DeviceDst) { |
| using DstElementTy = |
| typename std::remove_reference<DeviceDstTy>::type::value_type; |
| Span<const DstElementTy> HostSrcSpan(HostSrc); |
| DeviceMemorySpan<DstElementTy> DeviceDstSpan(DeviceDst); |
| if (HostSrcSpan.size() != DeviceDstSpan.size()) { |
| setStatus(Status("copyHToD source element count " + |
| std::to_string(HostSrcSpan.size()) + |
| " does not equal destination element count " + |
| std::to_string(DeviceDstSpan.size()))); |
| return *this; |
| } |
| if (setStatus(ThePlatform->asyncCopyHToD( |
| HostSrcSpan.data(), DeviceDstSpan.baseHandle(), |
| DeviceDstSpan.byte_offset(), DeviceDstSpan.byte_size(), |
| TheHandle.get())) |
| .isError()) { |
| return *this; |
| } |
| setStatus(sync()); |
| return *this; |
| } |
| |
| template <typename HostSrcTy, typename DeviceDstTy> |
| Stream &Stream::syncCopyHToD(HostSrcTy &&HostSrc, DeviceDstTy &DeviceDst, |
| ptrdiff_t ElementCount) { |
| using DstElementTy = |
| typename std::remove_reference<DeviceDstTy>::type::value_type; |
| Span<const DstElementTy> HostSrcSpan(HostSrc); |
| DeviceMemorySpan<DstElementTy> DeviceDstSpan(DeviceDst); |
| if (HostSrcSpan.size() < ElementCount) { |
| setStatus(Status("copyHToD source element count " + |
| std::to_string(HostSrcSpan.size()) + |
| " is less than requested element count " + |
| std::to_string(ElementCount))); |
| return *this; |
| } |
| if (DeviceDstSpan.size() < ElementCount) { |
| setStatus(Status("copyHToD destination element count " + |
| std::to_string(DeviceDstSpan.size()) + |
| " is less than requested element count " + |
| std::to_string(ElementCount))); |
| return *this; |
| } |
| if (setStatus(ThePlatform->asyncCopyHToD( |
| HostSrcSpan.data(), DeviceDstSpan.baseHandle(), |
| DeviceDstSpan.byte_offset(), |
| ElementCount * sizeof(DstElementTy), TheHandle.get())) |
| .isError()) { |
| return *this; |
| } |
| setStatus(sync()); |
| return *this; |
| } |
| |
| /// Owned device memory. |
| /// |
| /// Device memory that frees itself when it goes out of scope. |
| template <typename ElementType> class DeviceMemory { |
| public: |
| using element_type = ElementType; |
| using index_type = std::ptrdiff_t; |
| using value_type = typename std::remove_const<element_type>::type; |
| |
| DeviceMemory(const DeviceMemory &) = delete; |
| DeviceMemory &operator=(const DeviceMemory &) = delete; |
| DeviceMemory(DeviceMemory &&) noexcept; |
| DeviceMemory &operator=(DeviceMemory &&) noexcept; |
| ~DeviceMemory() = default; |
| |
| /// Gets the raw base handle for the underlying platform implementation. |
| void *handle() const { return ThePointer.get(); } |
| |
| index_type length() const { return TheSize; } |
| index_type size() const { return TheSize; } |
| index_type byte_size() const { // NOLINT |
| return TheSize * sizeof(element_type); |
| } |
| bool empty() const { return TheSize == 0; } |
| |
| // These conversion operators are useful for making triple-chevron kernel |
| // launches more concise. |
| operator element_type *() { |
| return static_cast<element_type *>(ThePointer.get()); |
| } |
| operator const element_type *() const { return ThePointer.get(); } |
| |
| /// Converts a const object to a DeviceMemorySpan of const elements. |
| DeviceMemorySpan<const element_type> asSpan() const { |
| return DeviceMemorySpan<const element_type>( |
| ThePlatform, static_cast<const element_type *>(ThePointer.get()), |
| TheSize, 0); |
| } |
| |
| /// Converts an object to a DeviceMemorySpan. |
| DeviceMemorySpan<element_type> asSpan() { |
| return DeviceMemorySpan<element_type>( |
| ThePlatform, static_cast<element_type *>(ThePointer.get()), TheSize, 0); |
| } |
| |
| private: |
| friend class Platform; |
| template <typename T> friend class DeviceMemorySpan; |
| |
| DeviceMemory(Platform *ThePlatform, void *Pointer, index_type ElementCount, |
| HandleDestructor Destructor) |
| : ThePlatform(ThePlatform), ThePointer(Pointer, Destructor), |
| TheSize(ElementCount) {} |
| |
| Platform *ThePlatform; |
| std::unique_ptr<void, HandleDestructor> ThePointer; |
| ptrdiff_t TheSize; |
| }; |
| |
// Out-of-line defaulted definitions of DeviceMemory's move operations.
template <typename T>
DeviceMemory<T>::DeviceMemory(DeviceMemory &&) noexcept = default;
template <typename T>
DeviceMemory<T> &DeviceMemory<T>::operator=(DeviceMemory &&) noexcept = default;
| |
| /// View into device memory. |
| /// |
| /// Like a Span, but for device memory rather than host memory. |
| template <typename ElementType> class DeviceMemorySpan { |
| public: |
| /// \name constants and types |
| /// \{ |
| using element_type = ElementType; |
| using index_type = std::ptrdiff_t; |
| using pointer = element_type *; |
| using reference = element_type &; |
| using iterator = element_type *; |
| using const_iterator = const element_type *; |
| using value_type = typename std::remove_const<element_type>::type; |
| /// \} |
| |
| DeviceMemorySpan() |
| : ThePlatform(nullptr), TheHandle(nullptr), TheSize(0), TheOffset(0), |
| TheSpanHandle(nullptr) {} |
| |
| // Intentionally implicit. |
| template <typename OtherElementType> |
| DeviceMemorySpan(DeviceMemorySpan<OtherElementType> &ASpan) |
| : ThePlatform(ASpan.ThePlatform), |
| TheHandle(static_cast<pointer>(ASpan.baseHandle())), |
| TheSize(ASpan.size()), TheOffset(ASpan.offset()), |
| TheSpanHandle(nullptr) {} |
| |
| // Intentionally implicit. |
| template <typename OtherElementType> |
| DeviceMemorySpan(DeviceMemorySpan<OtherElementType> &&ASpan) |
| : ThePlatform(ASpan.ThePlatform), |
| TheHandle(static_cast<pointer>(ASpan.baseHandle())), |
| TheSize(ASpan.size()), TheOffset(ASpan.offset()), |
| TheSpanHandle(nullptr) {} |
| |
| // Intentionally implicit. |
| template <typename OtherElementType> |
| DeviceMemorySpan(DeviceMemory<OtherElementType> &Memory) |
| : ThePlatform(Memory.ThePlatform), |
| TheHandle(static_cast<value_type *>(Memory.handle())), |
| TheSize(Memory.size()), TheOffset(0), TheSpanHandle(nullptr) {} |
| |
| ~DeviceMemorySpan() { |
| if (TheSpanHandle) { |
| ThePlatform->rawDestroyDeviceMemorySpanHandle( |
| const_cast<value_type *>(TheSpanHandle)); |
| } |
| } |
| |
| /// \name observers |
| /// \{ |
| index_type length() const { return TheSize; } |
| index_type size() const { return TheSize; } |
| index_type byte_size() const { // NOLINT |
| return TheSize * sizeof(element_type); |
| } |
| index_type offset() const { return TheOffset; } |
| index_type byte_offset() const { // NOLINT |
| return TheOffset * sizeof(element_type); |
| } |
| bool empty() const { return TheSize == 0; } |
| /// \} |
| |
| void *baseHandle() const { |
| return static_cast<void *>(const_cast<value_type *>(TheHandle)); |
| } |
| |
| /// Casts to a host memory pointer. |
| /// |
| /// This is only guaranteed to make sense for the CUDA platform, where device |
| /// pointers can be stored and manipulated much like host pointers. This makes |
| /// it easy to do triple-chevron kernel launches in CUDA because |
| /// DeviceMemorySpan values can be passed to parameters expecting regular |
| /// pointers. |
| /// |
| /// If the CUDA platform is using unified memory, it may also be possible to |
| /// dereference this pointer on the host. |
| /// |
| /// For platforms other than CUDA, this may return a garbage pointer. |
| operator element_type *() const { |
| if (!TheSpanHandle) |
| TheSpanHandle = ThePlatform->getDeviceMemorySpanHandle( |
| TheHandle, TheSize * sizeof(element_type), |
| TheOffset * sizeof(element_type)); |
| return TheSpanHandle; |
| } |
| |
| DeviceMemorySpan<element_type> first(index_type Count) const { |
| bool Valid = Count >= 0 && Count <= TheSize; |
| if (!Valid) |
| std::terminate(); |
| return DeviceMemorySpan<element_type>(ThePlatform, TheHandle, Count, |
| TheOffset); |
| } |
| |
| DeviceMemorySpan<element_type> last(index_type Count) const { |
| bool Valid = Count >= 0 && Count <= TheSize; |
| if (!Valid) |
| std::terminate(); |
| return DeviceMemorySpan<element_type>(ThePlatform, TheHandle, Count, |
| TheOffset + TheSize - Count); |
| } |
| |
| DeviceMemorySpan<element_type> |
| subspan(index_type Offset, index_type Count = dynamic_extent) const { |
| bool Valid = |
| (Offset == 0 || (Offset > 0 && Offset <= TheSize)) && |
| (Count == dynamic_extent || (Count >= 0 && Offset + Count <= TheSize)); |
| if (!Valid) |
| std::terminate(); |
| return DeviceMemorySpan<element_type>(ThePlatform, TheHandle, Count, |
| TheOffset + Offset); |
| } |
| |
| private: |
| template <typename T> friend class DeviceMemory; |
| template <typename T> friend class DeviceMemorySpan; |
| friend class Platform; |
| |
| DeviceMemorySpan(Platform *ThePlatform, pointer AHandle, index_type Size, |
| index_type Offset) |
| : ThePlatform(ThePlatform), TheHandle(AHandle), TheSize(Size), |
| TheOffset(Offset), TheSpanHandle(nullptr) {} |
| |
| Platform *ThePlatform; |
| pointer TheHandle; |
| index_type TheSize; |
| index_type TheOffset; |
| pointer TheSpanHandle; |
| }; |
| |
| /// Asynchronous host memory. |
| /// |
| /// This memory is pinned or otherwise registered in the host memory space to |
| /// allow for asynchronous copies between it and device memory. |
| /// |
| /// This memory unpins/unregisters itself when it goes out of scope, but does |
| /// not free itself. |
| template <typename ElementType> class AsyncHostMemory { |
| public: |
| using value_type = ElementType; |
| using remove_const_type = typename std::remove_const<ElementType>::type; |
| |
| AsyncHostMemory(const AsyncHostMemory &) = delete; |
| AsyncHostMemory &operator=(const AsyncHostMemory &) = delete; |
| AsyncHostMemory(AsyncHostMemory &&) noexcept; |
| AsyncHostMemory &operator=(AsyncHostMemory &&) noexcept; |
| ~AsyncHostMemory() = default; |
| |
| template <typename OtherElementType> |
| AsyncHostMemory(AsyncHostMemory<OtherElementType> &&Other) |
| : ThePointer(std::move(Other.ThePointer)), |
| TheElementCount(Other.TheElementCount) { |
| static_assert( |
| std::is_assignable<ElementType *, OtherElementType *>::value, |
| "cannot assign OtherElementType pointer to ElementType pointer type"); |
| } |
| |
| ElementType *data() const { |
| return const_cast<ElementType *>( |
| static_cast<remove_const_type *>(ThePointer.get())); |
| } |
| ptrdiff_t size() const { return TheElementCount; } |
| |
| private: |
| template <typename U> friend class AsyncHostMemory; |
| friend class Platform; |
| AsyncHostMemory(ElementType *Pointer, ptrdiff_t ElementCount, |
| HandleDestructor Destructor) |
| : ThePointer( |
| static_cast<void *>(const_cast<remove_const_type *>(Pointer)), |
| Destructor), |
| TheElementCount(ElementCount) {} |
| |
| std::unique_ptr<void, HandleDestructor> ThePointer; |
| ptrdiff_t TheElementCount; |
| }; |
| |
// Out-of-line defaulted definitions of AsyncHostMemory's move operations.
template <typename T>
AsyncHostMemory<T>::AsyncHostMemory(AsyncHostMemory &&) noexcept = default;
template <typename T>
AsyncHostMemory<T> &AsyncHostMemory<T>::
operator=(AsyncHostMemory &&) noexcept = default;
| |
/// Owned registered host memory.
///
/// Like AsyncHostMemory, but this memory also frees itself in addition to
/// unpinning/unregistering itself when it goes out of scope.
template <typename ElementType> class OwnedAsyncHostMemory {
public:
  using remove_const_type = typename std::remove_const<ElementType>::type;

  // Uniquely owned: movable but not copyable.
  OwnedAsyncHostMemory(const OwnedAsyncHostMemory &) = delete;
  OwnedAsyncHostMemory &operator=(const OwnedAsyncHostMemory &) = delete;
  OwnedAsyncHostMemory(OwnedAsyncHostMemory &&) noexcept;
  OwnedAsyncHostMemory &operator=(OwnedAsyncHostMemory &&) noexcept;

  ~OwnedAsyncHostMemory() {
    // Skip element destruction if this instance was moved from (null
    // pointer).
    if (ThePointer.get()) {
      // We use placement new to construct these objects, so we have to call the
      // destructors explicitly.
      for (ptrdiff_t I = 0; I < TheElementCount; ++I)
        static_cast<ElementType *>(ThePointer.get())[I].~ElementType();
      // ThePointer's HandleDestructor then releases the underlying
      // allocation.
    }
  }

  /// Pointer to the first owned element.
  ElementType *get() const {
    return const_cast<ElementType *>(
        static_cast<remove_const_type *>(ThePointer.get()));
  }

  /// Bounds-checked (by assert) element access.
  ElementType &operator[](ptrdiff_t I) const {
    assert(I >= 0 && I < TheElementCount);
    return get()[I];
  }

private:
  // AsyncHostMemorySpan reads TheElementCount directly when constructed from
  // this type.
  template <typename T> friend class AsyncHostMemorySpan;

  friend class Platform;

  // Only Platform creates instances; Destructor runs when ThePointer is
  // destroyed (after the element destructors in ~OwnedAsyncHostMemory).
  OwnedAsyncHostMemory(void *Memory, ptrdiff_t ElementCount,
                       HandleDestructor Destructor)
      : ThePointer(Memory, Destructor), TheElementCount(ElementCount) {}

  std::unique_ptr<void, HandleDestructor> ThePointer;
  ptrdiff_t TheElementCount;
};
| |
// Out-of-line defaulted definitions of OwnedAsyncHostMemory's move
// operations.
template <typename T>
OwnedAsyncHostMemory<T>::OwnedAsyncHostMemory(
    OwnedAsyncHostMemory &&) noexcept = default;
template <typename T>
OwnedAsyncHostMemory<T> &OwnedAsyncHostMemory<T>::
operator=(OwnedAsyncHostMemory &&) noexcept = default;
| |
| /// View into registered host memory. |
| /// |
| /// Like Span but for registered host memory. |
| template <typename ElementType> class AsyncHostMemorySpan { |
| public: |
| /// \name constants and types |
| /// \{ |
| using element_type = ElementType; |
| using index_type = std::ptrdiff_t; |
| using pointer = element_type *; |
| using reference = element_type &; |
| using iterator = element_type *; |
| using const_iterator = const element_type *; |
| using value_type = typename std::remove_const<element_type>::type; |
| /// \} |
| |
| AsyncHostMemorySpan() : TheSpan() {} |
| |
| // Intentionally implicit. |
| template <typename OtherElementType> |
| AsyncHostMemorySpan(AsyncHostMemory<OtherElementType> &Memory) |
| : TheSpan(Memory.data(), Memory.size()) {} |
| |
| // Intentionally implicit. |
| template <typename OtherElementType> |
| AsyncHostMemorySpan(OwnedAsyncHostMemory<OtherElementType> &Owned) |
| : TheSpan(Owned.get(), Owned.TheElementCount) {} |
| |
| // Intentionally implicit. |
| template <typename OtherElementType> |
| AsyncHostMemorySpan(AsyncHostMemorySpan<OtherElementType> &ASpan) |
| : TheSpan(ASpan) {} |
| |
| // Intentionally implicit. |
| template <typename OtherElementType> |
| AsyncHostMemorySpan(AsyncHostMemorySpan<OtherElementType> &&Span) |
| : TheSpan(Span) {} |
| |
| /// \name observers |
| /// \{ |
| index_type length() const { return TheSpan.length(); } |
| index_type size() const { return TheSpan.size(); } |
| index_type byte_size() const { // NOLINT |
| return TheSpan.size() * sizeof(element_type); |
| } |
| bool empty() const { return TheSpan.empty(); } |
| /// \} |
| |
| pointer data() const noexcept { return TheSpan.data(); } |
| operator element_type *() const { return TheSpan.data(); } |
| |
| AsyncHostMemorySpan<element_type> first(index_type Count) const { |
| return AsyncHostMemorySpan<element_type>(TheSpan.first(Count)); |
| } |
| |
| AsyncHostMemorySpan<element_type> last(index_type Count) const { |
| return AsyncHostMemorySpan<element_type>(TheSpan.last(Count)); |
| } |
| |
| AsyncHostMemorySpan<element_type> |
| subspan(index_type Offset, index_type Count = dynamic_extent) const { |
| return AsyncHostMemorySpan<element_type>(TheSpan.subspan(Offset, Count)); |
| } |
| |
| private: |
| template <typename T> friend class AsyncHostMemory; |
| |
| explicit AsyncHostMemorySpan(Span<ElementType> ArraySpan) |
| : TheSpan(ArraySpan) {} |
| |
| Span<ElementType> TheSpan; |
| }; |
| |
| } // namespace acxxel |
| |
| #endif // ACXXEL_ACXXEL_H |