| /* |
| * Copyright (c) 2007-2017, Intel Corporation |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice shall be included |
| * in all copies or substantial portions of the Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS |
| * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR |
| * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, |
| * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
| * OTHER DEALINGS IN THE SOFTWARE. |
| */ |
| //! |
| //! \file cm_queue_rt.cpp |
| //! \brief Contains CmQueueRT implementations. |
| //! |
| |
| #include "cm_queue_rt.h" |
| |
| #include "cm_mem.h" |
| #include "cm_device_rt.h" |
| #include "cm_event_rt.h" |
| #include "cm_task_rt.h" |
| #include "cm_task_internal.h" |
| #include "cm_thread_space_rt.h" |
| #include "cm_kernel_rt.h" |
| #include "cm_kernel_data.h" |
| #include "cm_buffer_rt.h" |
| #include "cm_group_space.h" |
| #include "cm_vebox_data.h" |
| #include "cm_surface_manager.h" |
| #include "cm_surface_2d_rt.h" |
| #include "cm_vebox_rt.h" |
| #include "cm_execution_adv.h" |
| |
| // Used by GPUCopy |
| #define BLOCK_PIXEL_WIDTH (32) |
| #define BLOCK_HEIGHT (8) |
| #define BLOCK_HEIGHT_NV12 (4) |
| #define SUB_BLOCK_PIXEL_WIDTH (8) |
| #define SUB_BLOCK_HEIGHT (8) |
| #define SUB_BLOCK_HEIGHT_NV12 (4) |
| #define INNER_LOOP (4) |
| #define BYTE_COPY_ONE_THREAD (1024*INNER_LOOP) //4K for each thread |
| #define THREAD_SPACE_WIDTH_INCREMENT (8) |
| //Used by unaligned copy |
| #define BLOCK_WIDTH (64) |
| #define PAGE_ALIGNED (0x1000) |
| |
| #define GPUCOPY_KERNEL_LOCK(a) ((a)->locked = true) |
| #define GPUCOPY_KERNEL_UNLOCK(a) ((a)->locked = false) |
| |
| namespace CMRT_UMD |
| { |
| //*----------------------------------------------------------------------------- |
| //| Purpose: Create Queue |
| //| Returns: Result of the operation. |
| //*----------------------------------------------------------------------------- |
| int32_t CmQueueRT::Create(CmDeviceRT *device, |
| CmQueueRT* &queue, |
| CM_QUEUE_CREATE_OPTION queueCreateOption) |
| { |
| int32_t result = CM_SUCCESS; |
| queue = new (std::nothrow) CmQueueRT(device, queueCreateOption); |
| if( queue ) |
| { |
| result = queue->Initialize( ); |
| if( result != CM_SUCCESS ) |
| { |
| CmQueueRT::Destroy( queue); |
| } |
| } |
| else |
| { |
| CM_ASSERTMESSAGE("Error: Failed to create CmQueue due to out of system memory."); |
| result = CM_OUT_OF_HOST_MEMORY; |
| } |
| return result; |
| } |
| |
| //*----------------------------------------------------------------------------- |
| //| Purpose: Destroy Queue |
| //| Returns: Result of the operation. |
| //*----------------------------------------------------------------------------- |
| int32_t CmQueueRT::Destroy(CmQueueRT* &queue ) |
| { |
| if( queue == nullptr ) |
| { |
| return CM_FAILURE; |
| } |
| |
| uint32_t result = queue->CleanQueue(); |
| CmSafeDelete( queue ); |
| |
| return result; |
| } |
| |
| //*----------------------------------------------------------------------------- |
| //| Purpose: Constructor of Cm Queue |
| //| Returns: Result of the operation. |
| //*----------------------------------------------------------------------------- |
// Constructor: records the owning device and the creation options, and
// zero/default-initializes all queue bookkeeping state. GPU-context creation
// and tracker assignment are deferred to Initialize().
CmQueueRT::CmQueueRT(CmDeviceRT *device,
                     CM_QUEUE_CREATE_OPTION queueCreateOption):
    m_device(device),
    m_eventArray(CM_INIT_EVENT_COUNT),
    m_eventCount(0),
    m_copyKernelParamArray(CM_INIT_GPUCOPY_KERNL_COUNT),
    m_copyKernelParamArrayCount(0),
    m_halMaxValues(nullptr),            // filled in by Initialize() via GetHalMaxValues()
    m_queueOption(queueCreateOption),
    m_usingVirtualEngine(false),
    m_osSyncEvent(nullptr),
    m_trackerIndex(0),                  // assigned a real tracker in Initialize()
    m_fastTrackerIndex(0),
    m_streamIndex(0)
{
    // Virtual-engine hint parameters are only populated when VE is enabled
    // (see Initialize()); start from a clean slate.
    MOS_ZeroMemory(&m_mosVeHintParams, sizeof(m_mosVeHintParams));
}
| |
| //*----------------------------------------------------------------------------- |
| //| Purpose: Destructor of Cm Queue |
| //| Returns: Result of the operation. |
| //*----------------------------------------------------------------------------- |
| CmQueueRT::~CmQueueRT() |
| { |
| m_osSyncEvent = nullptr; |
| uint32_t eventArrayUsedSize = m_eventArray.GetMaxSize(); |
| for( uint32_t i = 0; i < eventArrayUsedSize; i ++ ) |
| { |
| CmEventRT* event = (CmEventRT*)m_eventArray.GetElement( i ); |
| uint32_t eventReleaseTimes = 0; |
| while( event ) |
| { // destroy the event no matter if it is released by user |
| if(eventReleaseTimes > 2) |
| { |
| // The max of event's reference cout is 2 |
| // if the event is not released after 2 times, there is something wrong |
| CM_ASSERTMESSAGE("Error: The max of event's reference cout is 2."); |
| break; |
| } |
| CmEventRT::Destroy( event ); |
| eventReleaseTimes ++; |
| } |
| } |
| m_eventArray.Delete(); |
| |
| // Do not destroy the kernel in m_copyKernelParamArray. |
| // They have been destoyed in ~CmDevice() before destroying Queue |
| for( uint32_t i = 0; i < m_copyKernelParamArrayCount; i ++ ) |
| { |
| CM_GPUCOPY_KERNEL *gpuCopyParam = (CM_GPUCOPY_KERNEL*)m_copyKernelParamArray.GetElement( i ); |
| CmSafeDelete(gpuCopyParam); |
| } |
| |
| m_copyKernelParamArray.Delete(); |
| |
| } |
| |
| //*----------------------------------------------------------------------------- |
| //| Purpose: Initialize Cm Queue |
| //| Returns: Result of the operation. |
| //*----------------------------------------------------------------------------- |
int32_t CmQueueRT::Initialize()
{
    PCM_HAL_STATE cmHalState = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
    CM_HAL_MAX_VALUES_EX* halMaxValuesEx = nullptr;
    CM_RETURN_CODE hr = CM_SUCCESS;
    // Caches the HAL limit table in m_halMaxValues for later enqueue checks.
    m_device->GetHalMaxValues(m_halMaxValues, halMaxValuesEx);

    // Assign a new tracker and record the tracker index
    int ret = cmHalState->renderHal->trackerProducer.AssignNewTracker();
    CM_CHK_COND_RETURN((ret < 0), CM_FAILURE, "Error: failed to assign a new tracker");
    m_trackerIndex = ret;
    // A separate tracker is kept for the fast (advExecutor) path.
    ret = cmHalState->advExecutor->AssignNewTracker();
    CM_CHK_COND_RETURN((ret < 0), CM_FAILURE, "Error: failed to assign a new tracker");
    m_fastTrackerIndex = ret;

    // Creates or gets GPU Context for the test
    if (m_queueOption.UserGPUContext == true)
    {
        // Checks if it is the user-provided GPU context. If it is valid, we will create the queue with the existing Context
        if (cmHalState->osInterface->pfnIsGpuContextValid(cmHalState->osInterface, (MOS_GPU_CONTEXT)m_queueOption.GPUContext) != MOS_STATUS_SUCCESS)
        {
            // Returns failure
            CM_ASSERTMESSAGE("Error: The user passed in an GPU context which is not valid");
            return CM_INVALID_USER_GPU_CONTEXT_FOR_QUEUE_EX;
        }
    }
    else
    {
        MOS_GPUCTX_CREATOPTIONS ctxCreateOption;
        // Scale the command-buffer count by the device's max task capacity.
        ctxCreateOption.CmdBufferNumScale
                = HalCm_GetNumCmdBuffers(cmHalState->osInterface, cmHalState->cmDeviceParam.maxTasks);

        // Create MDF preset GPU context, update GPUContext in m_queueOption
        if (m_queueOption.QueueType == CM_QUEUE_TYPE_RENDER)
        {
            // Default render context choice depends on whether a custom CM
            // context was requested at device creation.
            MOS_GPU_CONTEXT tmpGpuCtx = cmHalState->requestCustomGpuContext? MOS_GPU_CONTEXT_RENDER4: MOS_GPU_CONTEXT_RENDER3;;

            // check if context handle was specified by user.
            if (m_queueOption.GPUContext != 0)
            {
                tmpGpuCtx = (MOS_GPU_CONTEXT)m_queueOption.GPUContext;
            }

            // sanity check of context handle for CM
            if (HalCm_IsValidGpuContext(tmpGpuCtx) == false)
            {
                return CM_INVALID_USER_GPU_CONTEXT_FOR_QUEUE_EX;
            }

            // SSEU overriding
            if (cmHalState->cmHalInterface->IsOverridePowerOptionPerGpuContext())
            {
                // checking if need shutdown sub-slices for VME usage
                if (m_queueOption.SseuUsageHint == CM_QUEUE_SSEU_USAGE_HINT_VME
                 && cmHalState->cmHalInterface->IsRequestShutdownSubslicesForVmeUsage())
                {
                    MEDIA_SYSTEM_INFO *gtSystemInfo = cmHalState->osInterface->pfnGetGtSystemInfo(cmHalState->osInterface);
                    ctxCreateOption.packed.SliceCount    = (uint8_t)gtSystemInfo->SliceCount;
                    // Halve the sub-slice count per slice for VME workloads.
                    ctxCreateOption.packed.SubSliceCount = (gtSystemInfo->SubSliceCount / gtSystemInfo->SliceCount) >> 1; // set to half
                    ctxCreateOption.packed.MaxEUcountPerSubSlice = gtSystemInfo->EUCount/gtSystemInfo->SubSliceCount;
                    ctxCreateOption.packed.MinEUcountPerSubSlice = gtSystemInfo->EUCount/gtSystemInfo->SubSliceCount;
                }

#if (_DEBUG || _RELEASE_INTERNAL)
                // Debug/internal builds allow the SSEU configuration to be
                // overridden via a user-feature key (0xDEADC0DE = "not set").
                MOS_USER_FEATURE_VALUE_DATA UserFeatureData = {0};
                MOS_UserFeature_ReadValue_ID(
                    nullptr,
                    __MEDIA_USER_FEATURE_VALUE_SSEU_SETTING_OVERRIDE_ID,
                    &UserFeatureData);

                // +---------------+----------------+----------------+----------------+
                // |   EUCountMax  |   EUCountMin   |     SSCount    |   SliceCount   |
                // +-------------24+--------------16+---------------8+---------------0+
                if (UserFeatureData.u32Data != 0xDEADC0DE)
                {
                    ctxCreateOption.packed.SliceCount            = UserFeatureData.u32Data        & 0xFF;       // Bits 0-7
                    ctxCreateOption.packed.SubSliceCount         = (UserFeatureData.u32Data >> 8)  & 0xFF;      // Bits 8-15
                    ctxCreateOption.packed.MaxEUcountPerSubSlice = (UserFeatureData.u32Data >> 16) & 0xFF;      // Bits 16-23
                    ctxCreateOption.packed.MinEUcountPerSubSlice = (UserFeatureData.u32Data >> 24) & 0xFF;      // Bits 24-31
                }
#endif
            }

            ctxCreateOption.RAMode = m_queueOption.RAMode;

            // Create render GPU context.
            CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(
                CreateGpuContext(cmHalState, tmpGpuCtx, MOS_GPU_NODE_3D,
                                 &ctxCreateOption));

#if (_RELEASE_INTERNAL || _DEBUG)
#if defined(CM_DIRECT_GUC_SUPPORT)
            //init GuC
            CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(cmHalState->osInterface->pfnInitGuC(cmHalState->osInterface, MOS_GPU_NODE_3D));
#endif
#endif
            // Publish the context actually used back into the queue options.
            m_queueOption.GPUContext = tmpGpuCtx;
        }
        else if (m_queueOption.QueueType == CM_QUEUE_TYPE_COMPUTE)
        {
            ctxCreateOption.RAMode = m_queueOption.RAMode;

            bool bVeUsedInCm = false; //need change to true once feature is done in future.
#if (_DEBUG || _RELEASE_INTERNAL)
            // Debug/internal builds can force-enable VE via a user-feature key.
            MOS_USER_FEATURE_VALUE_DATA UserFeatureData = {0};
            MOS_UserFeature_ReadValue_ID(nullptr,
                __MEDIA_USER_FEATURE_VALUE_MDF_CCS_USE_VE_INTERFACE, &UserFeatureData);
            bVeUsedInCm = (UserFeatureData.u32Data == 0x1)? true: false;
#endif
            Mos_SetVirtualEngineSupported(cmHalState->osInterface, bVeUsedInCm);

            if (cmHalState->osInterface->veDefaultEnable && cmHalState->osInterface->bSupportVirtualEngine) // check if VE enabled on OS
            {
                // prepare virtual egine hint param on this cm queue.
                CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(
                    HalCm_PrepareVEHintParam(cmHalState, false, &m_mosVeHintParams));

                m_usingVirtualEngine = true;
            }

            // Compute queues always use the dedicated CM compute context.
            CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(
                CreateGpuContext(cmHalState, MOS_GPU_CONTEXT_CM_COMPUTE,
                                 MOS_GPU_NODE_COMPUTE, &ctxCreateOption));
            m_queueOption.GPUContext = MOS_GPU_CONTEXT_CM_COMPUTE;
        }
        else
        {
            // Returns failure
            CM_ASSERTMESSAGE("Error: The QueueType is not supported by MDF.");
            return CM_NOT_IMPLEMENTED;
        }
    }

finish:
    return hr;
}
| |
| //*----------------------------------------------------------------------------- |
| //| Purpose: Checks whether any kernels in the task have a thread argument |
| //| Returns: Result of the operation. |
| //*----------------------------------------------------------------------------- |
| int32_t CmQueueRT::GetTaskHasThreadArg(CmKernelRT* kernelArray[], uint32_t numKernels, bool& threadArgExists) |
| { |
| threadArgExists = false; |
| |
| for(uint32_t krn = 0; krn < numKernels; krn++) |
| { |
| if( !kernelArray[krn] ) |
| { |
| CM_ASSERTMESSAGE("Error: The kernel in the task have no thread argument."); |
| return CM_FAILURE; |
| } |
| |
| if( kernelArray[krn]->IsThreadArgExisted( ) ) |
| { |
| threadArgExists = true; |
| break; |
| } |
| } |
| |
| return CM_SUCCESS; |
| } |
| |
| //*----------------------------------------------------------------------------- |
| //| Purpose: Enqueue Task |
| //| Arguments : |
| //| kernelArray [in] Pointer to kernel array |
| //| event [in] Reference to the pointer to Event |
| //| threadSpace [out] Pointer to thread space |
| //| |
| //| Returns: Result of the operation. |
| //*----------------------------------------------------------------------------- |
CM_RT_API int32_t CmQueueRT::Enqueue(
    CmTask* kernelArray,
    CmEvent* & event,
    const CmThreadSpace* threadSpace)
{
    INSERT_API_CALL_LOG();

    // Validate the task pointer before using it.
    if (kernelArray == nullptr)
    {
        CM_ASSERTMESSAGE("Error: Kernel array is null.");
        return CM_INVALID_ARG_VALUE;
    }

    CmTaskRT *kernelArrayRT = static_cast<CmTaskRT *>(kernelArray);
    uint32_t kernelCount = 0;
    kernelCount = kernelArrayRT->GetKernelCount();
    if (kernelCount == 0)
    {
        CM_ASSERTMESSAGE("Error: Invalid kernel count.");
        return CM_FAILURE;
    }

    // Enforce the HAL's per-task kernel limit.
    if (kernelCount > m_halMaxValues->maxKernelsPerTask)
    {
        CM_ASSERTMESSAGE("Error: Kernel count exceeds max kernel per enqueue.");
        return CM_EXCEED_MAX_KERNEL_PER_ENQUEUE;
    }

    int32_t result;
    const CmThreadSpaceRT *threadSpaceRTConst = static_cast<const CmThreadSpaceRT *>(threadSpace);
    PCM_HAL_STATE cmHalState = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
    // When media mode is unavailable on this platform, reroute the task to
    // the thread-group (GPGPU walker) path instead.
    if (cmHalState->cmHalInterface->CheckMediaModeAvailability() == false)
    {
        if (threadSpaceRTConst != nullptr)
        {
            result = EnqueueWithGroup(kernelArray, event, threadSpaceRTConst->GetThreadGroupSpace());
        }
        else
        {
            // If there isn't any shared thread space or associated thread space,
            // create a temporary (maxThreadCount x 1) thread group space whose
            // size equal to the max thread count of kernel who doesn't have a
            // thread space associated.
            uint32_t maxThreadCount = 1;
            bool usedCommonTGS = false;
            for (uint32_t i = 0; i < kernelCount; i++)
            {
                CmKernelRT *tmpKernel = kernelArrayRT->GetKernelPointer(i);
                CmThreadGroupSpace *tmpTGS = nullptr;
                tmpKernel->GetThreadGroupSpace(tmpTGS);

                // Only kernels without their own group space fall back to the
                // common temporary one.
                if (tmpTGS == nullptr)
                {
                    usedCommonTGS = true;
                    uint32_t singleThreadCount = 0;
                    tmpKernel->GetThreadCount(singleThreadCount);
                    if (maxThreadCount < singleThreadCount)
                    {
                        maxThreadCount = singleThreadCount;
                    }
                }
            }

            CmThreadGroupSpace *threadGroupSpaceTemp = nullptr;
            if (usedCommonTGS == true)
            {
                result = m_device->CreateThreadGroupSpace(1, 1, maxThreadCount, 1, threadGroupSpaceTemp);
                if (result != CM_SUCCESS)
                {
                    CM_ASSERTMESSAGE("Error: Creating temporary thread group space failure.");
                    return result;
                }
            }

            result = EnqueueWithGroup(kernelArray, event, threadGroupSpaceTemp);

            // The temporary group space is only needed for submission.
            if (threadGroupSpaceTemp != nullptr)
            {
                m_device->DestroyThreadGroupSpace(threadGroupSpaceTemp);
            }
        }
        return result;
    }

    // If the thread space requires a kernel pointer but none was set,
    // associate it with the task's first kernel.
    if (threadSpaceRTConst && threadSpaceRTConst->IsThreadAssociated())
    {
        if (threadSpaceRTConst->GetNeedSetKernelPointer() && threadSpaceRTConst->KernelPointerIsNULL())
        {
            CmKernelRT* tmp = nullptr;
            tmp = kernelArrayRT->GetKernelPointer(0);
            threadSpaceRTConst->SetKernelPointer(tmp);
        }
    }

#if _DEBUG
    // Debug-only consistency check between the thread space and the task.
    if (threadSpaceRTConst)
    {
        CmThreadSpaceRT *threadSpaceRT = const_cast<CmThreadSpaceRT*>(threadSpaceRTConst);
        if (!threadSpaceRT->IntegrityCheck(kernelArrayRT))
        {
            CM_ASSERTMESSAGE("Error: Invalid thread space.");
            return CM_INVALID_THREAD_SPACE;
        }
    }
#endif

    if(m_device->IsPrintEnable())
    {
        m_device->ClearPrintBuffer();
    }

    // Build a null-terminated kernel-pointer array for Enqueue_RT.
    typedef CmKernelRT* pCmKernel;
    CmKernelRT** tmp = MOS_NewArray(pCmKernel, (kernelCount + 1));
    if(tmp == nullptr)
    {
        CM_ASSERTMESSAGE("Error: Out of system memory.");
        return CM_OUT_OF_HOST_MEMORY;
    }

    uint32_t totalThreadNumber = 0;
    for(uint32_t i = 0; i < kernelCount; i++)
    {
        tmp[ i ] = kernelArrayRT->GetKernelPointer(i);

        uint32_t singleThreadNumber = 0;
        tmp[i]->GetThreadCount(singleThreadNumber);
        // A zero per-kernel thread count means the count comes from the
        // thread-space dimensions instead.
        if (singleThreadNumber == 0)
        {
            CmThreadSpaceRT *threadSpaceRT = const_cast<CmThreadSpaceRT*>(threadSpaceRTConst);
            if (threadSpaceRT)
            {
                uint32_t width, height;
                threadSpaceRT->GetThreadSpaceSize(width, height);
                singleThreadNumber = width*height;
            }
        }
        totalThreadNumber += singleThreadNumber;
    }
    tmp[kernelCount ] = nullptr;

    CmEventRT *eventRT = static_cast<CmEventRT *>(event);
    result = Enqueue_RT(tmp, kernelCount, totalThreadNumber, eventRT, threadSpaceRTConst, kernelArrayRT->GetSyncBitmap(), kernelArrayRT->GetPowerOption(),
                        kernelArrayRT->GetConditionalEndBitmap(), kernelArrayRT->GetConditionalEndInfo(), kernelArrayRT->GetTaskConfig());

    // Record kernel names on the event for profiling/debug purposes.
    if (eventRT)
    {
        eventRT->SetKernelNames(kernelArrayRT, const_cast<CmThreadSpaceRT*>(threadSpaceRTConst), nullptr);
    }

    event = eventRT;
    MosSafeDeleteArray( tmp );

    return result;
}
| |
| //*----------------------------------------------------------------------------- |
| //| Purpose: Enqueue Task |
| //| Arguments : |
| //| kernelArray [in] Pointer to kernel array |
| //| event [in] Reference to the pointer to Event |
| //| threadSpace [out] Pointer to thread space |
| //| |
| //| Returns: Result of the operation. |
| //*----------------------------------------------------------------------------- |
int32_t CmQueueRT::Enqueue_RT(
                        CmKernelRT* kernelArray[],
                        const uint32_t kernelCount,
                        const uint32_t totalThreadCount,
                        CmEventRT* & event,
                        const CmThreadSpaceRT* threadSpace,
                        uint64_t    syncBitmap,
                        PCM_POWER_OPTION powerOption,
                        uint64_t    conditionalEndBitmap,
                        CM_HAL_CONDITIONAL_BB_END_INFO* conditionalEndInfo,
                        PCM_TASK_CONFIG  taskConfig)
{
    if(kernelArray == nullptr)
    {
        CM_ASSERTMESSAGE("Error: Kernel array is NULL.");
        return CM_INVALID_ARG_VALUE;
    }

    if( kernelCount == 0 )
    {
        CM_ASSERTMESSAGE("Error: There are no valid kernels.");
        return CM_INVALID_ARG_VALUE;
    }

    // CM_NO_EVENT means the caller opted out of receiving an event.
    bool isEventVisible = (event == CM_NO_EVENT)? false:true;

    // Serialize internal task submission on this queue.
    CLock Locker(m_criticalSectionTaskInternal);

    // set the current tracker index in renderhal
    PCM_CONTEXT_DATA cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
    CM_CHK_NULL_RETURN_CMERROR(cmData);
    CM_CHK_NULL_RETURN_CMERROR(cmData->cmHalState);
    CM_CHK_NULL_RETURN_CMERROR(cmData->cmHalState->renderHal);
    cmData->cmHalState->renderHal->currentTrackerIndex = m_trackerIndex;

    CmTaskInternal* task = nullptr;
    int32_t result = CmTaskInternal::Create(kernelCount, totalThreadCount, kernelArray, threadSpace, m_device, syncBitmap, task, conditionalEndBitmap, conditionalEndInfo);
    if( result != CM_SUCCESS )
    {
        CM_ASSERTMESSAGE("Error: Create CM task internal failure.");
        return result;
    }

    // Timestamp the enqueue for event-based latency reporting.
    LARGE_INTEGER nEnqueueTime;
    if ( !(MOS_QueryPerformanceCounter( (uint64_t*)&nEnqueueTime.QuadPart )))
    {
        CM_ASSERTMESSAGE("Error: Query performance counter failure.");
        CmTaskInternal::Destroy(task);
        return CM_FAILURE;
    }

    int32_t taskDriverId = -1;

    // NOTE(review): on CreateEvent failure `task` is not destroyed here,
    // unlike the performance-counter failure path above — confirm whether
    // CreateEvent takes ownership of the task on failure.
    result = CreateEvent(task, isEventVisible, taskDriverId, event);
    if (result != CM_SUCCESS)
    {
        CM_ASSERTMESSAGE("Error: Create event failure.");
        return result;
    }
    if ( event != nullptr )
    {
        event->SetEnqueueTime( nEnqueueTime );
    }

    task->SetPowerOption( powerOption );

    task->SetProperty(taskConfig);

    if( !m_enqueuedTasks.Push( task ) )
    {
        CM_ASSERTMESSAGE("Error: Push enqueued tasks failure.");
        return CM_FAILURE;
    }

    // Submit everything queued so far without waiting for completion.
    result = FlushTaskWithoutSync();

    return result;
}
| |
// Internal enqueue for the thread-group (GPGPU walker) path. Mirrors the
// thread-space overload above, but builds the internal task from a
// CmThreadGroupSpace and forwards per-kernel execution configs.
int32_t CmQueueRT::Enqueue_RT(CmKernelRT* kernelArray[],
                              const uint32_t kernelCount,
                              const uint32_t totalThreadCount,
                              CmEventRT* & event,
                              const CmThreadGroupSpace* threadGroupSpace,
                              uint64_t    syncBitmap,
                              PCM_POWER_OPTION powerOption,
                              uint64_t    conditionalEndBitmap,
                              CM_HAL_CONDITIONAL_BB_END_INFO* conditionalEndInfo,
                              PCM_TASK_CONFIG  taskConfig,
                              const CM_EXECUTION_CONFIG* krnExecCfg)
{
    if(kernelArray == nullptr)
    {
        CM_ASSERTMESSAGE("Error: Kernel array is NULL.");
        return CM_INVALID_ARG_VALUE;
    }

    if( kernelCount == 0 )
    {
        CM_ASSERTMESSAGE("Error: There are no valid kernels.");
        return CM_INVALID_ARG_VALUE;
    }

    // Serialize internal task submission on this queue.
    CLock Locker(m_criticalSectionTaskInternal);

    // set the current tracker index in renderhal
    PCM_CONTEXT_DATA cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
    CM_CHK_NULL_RETURN_CMERROR(cmData);
    CM_CHK_NULL_RETURN_CMERROR(cmData->cmHalState);
    CM_CHK_NULL_RETURN_CMERROR(cmData->cmHalState->renderHal);
    cmData->cmHalState->renderHal->currentTrackerIndex = m_trackerIndex;

    CmTaskInternal* task = nullptr;
    int32_t result = CmTaskInternal::Create( kernelCount, totalThreadCount, kernelArray,
                                             threadGroupSpace, m_device, syncBitmap, task,
                                             conditionalEndBitmap, conditionalEndInfo, krnExecCfg);
    if( result != CM_SUCCESS )
    {
        CM_ASSERTMESSAGE("Error: Create CmTaskInternal failure.");
        return result;
    }

    // Timestamp the enqueue for event-based latency reporting.
    LARGE_INTEGER nEnqueueTime;
    if ( !(MOS_QueryPerformanceCounter( (uint64_t*)&nEnqueueTime.QuadPart )))
    {
        CM_ASSERTMESSAGE("Error: Query performance counter failure.");
        CmTaskInternal::Destroy(task);
        return CM_FAILURE;
    }

    int32_t taskDriverId = -1;
    // CM_NO_EVENT means the caller opted out of receiving an event.
    // NOTE(review): as in the thread-space overload, `task` is not destroyed
    // on CreateEvent failure — confirm ownership semantics of CreateEvent.
    result = CreateEvent(task, !(event == CM_NO_EVENT) , taskDriverId, event);
    if (result != CM_SUCCESS)
    {
        CM_ASSERTMESSAGE("Error: Create event failure.");
        return result;
    }
    if ( event != nullptr )
    {
        event->SetEnqueueTime( nEnqueueTime );
    }

    task->SetPowerOption( powerOption );

    task->SetProperty(taskConfig);

    if( !m_enqueuedTasks.Push( task ) )
    {
        CM_ASSERTMESSAGE("Error: Push enqueued tasks failure.")
        return CM_FAILURE;
    }

    // Submit everything queued so far without waiting for completion.
    result = FlushTaskWithoutSync();

    return result;
}
| |
// Internal enqueue for the EnqueueWithHints path. The kernel array is
// null-terminated; the task is built with hint information and may be one of
// several tasks generated from a single user task (see numTasksGenerated /
// isLastTask).
int32_t CmQueueRT::Enqueue_RT( CmKernelRT* kernelArray[],
                               CmEventRT* & event,
                               uint32_t numTasksGenerated,
                               bool isLastTask,
                               uint32_t hints,
                               PCM_POWER_OPTION powerOption)
{
    int32_t result = CM_FAILURE;
    uint32_t kernelCount = 0;
    CmTaskInternal* task = nullptr;
    int32_t taskDriverId = -1;
    // CM_NO_EVENT means the caller opted out of receiving an event.
    bool isEventVisible = (event == CM_NO_EVENT) ? false:true;
    bool threadArgExists = false;

    if( kernelArray == nullptr)
    {
        CM_ASSERTMESSAGE("Error: Kernel array is NULL.");
        return CM_INVALID_ARG_VALUE;
    }
    // Count kernels up to the null terminator.
    while( kernelArray[ kernelCount ] )
    {
        kernelCount++;
    }

    if( kernelCount < CM_MINIMUM_NUM_KERNELS_ENQWHINTS )
    {
        CM_ASSERTMESSAGE("Error: EnqueueWithHints requires at least 2 kernels.");
        return CM_FAILURE;
    }

    uint32_t totalThreadCount = 0;
    for( uint32_t i = 0; i < kernelCount; i ++ )
    {
        uint32_t threadCount = 0;
        kernelArray[i]->GetThreadCount( threadCount );
        totalThreadCount += threadCount;
    }

    if( GetTaskHasThreadArg(kernelArray, kernelCount, threadArgExists) != CM_SUCCESS )
    {
        CM_ASSERTMESSAGE("Error: Thread argument checking fails.");
        return CM_FAILURE;
    }

    // The allowed thread total differs depending on whether any kernel uses
    // per-thread arguments.
    if( !threadArgExists )
    {
        if (totalThreadCount > m_halMaxValues->maxUserThreadsPerTaskNoThreadArg )
        {
            CM_ASSERTMESSAGE("Error: Maximum number of threads per task exceeded.");
            return CM_EXCEED_MAX_THREAD_AMOUNT_PER_ENQUEUE;
        }
    }
    else
    {
        if( totalThreadCount > m_halMaxValues->maxUserThreadsPerTask )
        {
            CM_ASSERTMESSAGE("Error: Maximum number of threads per task exceeded.");
            return CM_EXCEED_MAX_THREAD_AMOUNT_PER_ENQUEUE;
        }
    }

    // Serialize internal task submission on this queue.
    CLock Locker(m_criticalSectionTaskInternal);

    // set the current tracker index in renderhal
    PCM_CONTEXT_DATA cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
    CM_CHK_NULL_RETURN_CMERROR(cmData);
    CM_CHK_NULL_RETURN_CMERROR(cmData->cmHalState);
    CM_CHK_NULL_RETURN_CMERROR(cmData->cmHalState->renderHal);
    cmData->cmHalState->renderHal->currentTrackerIndex = m_trackerIndex;

    result = CmTaskInternal::Create( kernelCount, totalThreadCount, kernelArray, task, numTasksGenerated, isLastTask, hints, m_device );
    if( result != CM_SUCCESS )
    {
        CM_ASSERTMESSAGE("Error: Create CM task internal failure.");
        return result;
    }

    // Timestamp the enqueue for event-based latency reporting.
    LARGE_INTEGER nEnqueueTime;
    if ( !(MOS_QueryPerformanceCounter( (uint64_t*)&nEnqueueTime.QuadPart )) )
    {
        CM_ASSERTMESSAGE("Error: Query performance counter failure.");
        CmTaskInternal::Destroy(task);
        return CM_FAILURE;
    }

    // NOTE(review): `task` is not destroyed on CreateEvent failure —
    // confirm ownership semantics of CreateEvent.
    result = CreateEvent(task, isEventVisible, taskDriverId, event);
    if (result != CM_SUCCESS)
    {
        CM_ASSERTMESSAGE("Error: Create event failure.");
        return result;
    }
    if ( event != nullptr )
    {
        event->SetEnqueueTime( nEnqueueTime );
    }

    // Reset the adjusted Y coordinate on every kernel in the task.
    for( uint32_t i = 0; i < kernelCount; ++i )
    {
        CmKernelRT* kernel = nullptr;
        task->GetKernel(i, kernel);
        if( kernel != nullptr )
        {
            kernel->SetAdjustedYCoord(0);
        }
    }

    task->SetPowerOption( powerOption );

    if (!m_enqueuedTasks.Push(task))
    {
        CM_ASSERTMESSAGE("Error: Push enqueued tasks failure.")
        return CM_FAILURE;
    }

    // Submit everything queued so far without waiting for completion.
    result = FlushTaskWithoutSync();

    return result;
}
| |
| //*----------------------------------------------------------------------------- |
| //! Function to enqueue task with thread group space pointer |
| //! Arguments: |
| //! 1. Pointer to CmTask, which can only contain one kernel. |
| //! 2. Reference to the pointer to CmEvent that is to be returned |
| //! 3. Pointer to a CmThreadGroupSpace. |
| //! Return Value: |
| //! CM_SUCCESS if the task is successfully enqueued and the CmEvent is generated |
| //! CM_OUT_OF_HOST_MEMORY if out of host memory |
| //! CM_FAILURE otherwise |
| //! Notes: |
| //! If the kernel has per thread arg, GPGPU object is to be used. |
| //! If the kernel has no per thread arg. GPGPU walker is used. |
| //*----------------------------------------------------------------------------- |
| CM_RT_API int32_t CmQueueRT::EnqueueWithGroup( CmTask* task, CmEvent* & event, const CmThreadGroupSpace* threadGroupSpace) |
| { |
| INSERT_API_CALL_LOG(); |
| |
| int32_t result; |
| |
| if(task == nullptr) |
| { |
| CM_ASSERTMESSAGE("Error: Kernel array is NULL."); |
| return CM_INVALID_ARG_VALUE; |
| } |
| |
| CmTaskRT *taskRT = static_cast<CmTaskRT *>(task); |
| uint32_t count = 0; |
| count = taskRT->GetKernelCount(); |
| |
| if( count == 0 ) |
| { |
| CM_ASSERTMESSAGE("Error: There are no valid kernels."); |
| return CM_FAILURE; |
| } |
| |
| if(m_device->IsPrintEnable()) |
| { |
| m_device->ClearPrintBuffer(); |
| } |
| |
| typedef CmKernelRT* pCmKernel; |
| CmKernelRT** tmp = MOS_NewArray(pCmKernel, (count+1)); |
| if(tmp == nullptr) |
| { |
| CM_ASSERTMESSAGE("Error: Out of system memory."); |
| return CM_OUT_OF_HOST_MEMORY; |
| } |
| |
| uint32_t totalThreadNumber = 0; |
| for(uint32_t i = 0; i < count; i++) |
| { |
| uint32_t singleThreadNumber = 0; |
| tmp[ i ] = taskRT->GetKernelPointer(i); |
| |
| //Thread arguments is not allowed in GPGPU_WALKER path |
| if(tmp[i]->IsThreadArgExisted()) |
| { |
| CM_ASSERTMESSAGE("Error: No thread Args allowed when using group space"); |
| MosSafeDeleteArray(tmp); |
| return CM_THREAD_ARG_NOT_ALLOWED; |
| } |
| |
| tmp[i]->GetThreadCount(singleThreadNumber); |
| totalThreadNumber += singleThreadNumber; |
| } |
| tmp[count ] = nullptr; |
| |
| CmEventRT *eventRT = static_cast<CmEventRT *>(event); |
| result = Enqueue_RT( tmp, count, totalThreadNumber, eventRT, |
| threadGroupSpace, taskRT->GetSyncBitmap(), |
| taskRT->GetPowerOption(), |
| taskRT->GetConditionalEndBitmap(), taskRT->GetConditionalEndInfo(), |
| taskRT->GetTaskConfig(), taskRT->GetKernelExecuteConfig()); |
| |
| if (eventRT) |
| { |
| eventRT->SetKernelNames(taskRT, nullptr, const_cast<CmThreadGroupSpace*>(threadGroupSpace)); |
| } |
| |
| event = eventRT; |
| MosSafeDeleteArray( tmp ); |
| |
| return result; |
| } |
| |
// Enqueues a task using scheduling hints. Depending on the hint bits, the
// user task may be split into several internally generated tasks, each
// submitted through the hints overload of Enqueue_RT.
CM_RT_API int32_t CmQueueRT::EnqueueWithHints(
                                        CmTask* kernelArray,
                                        CmEvent* & event,
                                        uint32_t hints)
{
    INSERT_API_CALL_LOG();

    int32_t hr = CM_FAILURE;
    uint32_t count = 0;
    uint32_t index = 0;
    CmKernelRT** kernels = nullptr;
    uint32_t numTasks = 0;
    bool splitTask = false;
    bool lastTask = false;
    uint32_t numTasksGenerated = 0;
    CmEventRT *eventRT = static_cast<CmEventRT *>(event);

    if (kernelArray == nullptr)
    {
        return CM_INVALID_ARG_VALUE;
    }
    CmTaskRT *kernelArrayRT = static_cast<CmTaskRT *>(kernelArray);
    count = kernelArrayRT->GetKernelCount();
    if( count == 0 )
    {
        CM_ASSERTMESSAGE("Error: Invalid kernel count.");
        hr = CM_FAILURE;
        goto finish;
    }

    if( count > m_halMaxValues->maxKernelsPerTask )
    {
        CM_ASSERTMESSAGE("Error: Kernel count exceeds maximum kernel per enqueue.");
        hr = CM_EXCEED_MAX_KERNEL_PER_ENQUEUE;
        goto finish;
    }

    // Every kernel must have a thread space; bind the kernel pointer to its
    // thread space where the space requires one and none is set yet.
    for (uint32_t i = 0; i < count; ++i)
    {
        CmKernelRT* kernelTmp = nullptr;
        CmThreadSpaceRT* threadSpaceTmp = nullptr;
        kernelTmp = kernelArrayRT->GetKernelPointer(i);
        CM_CHK_NULL_GOTOFINISH_CMERROR(kernelTmp);
        kernelTmp->GetThreadSpace(threadSpaceTmp);
        CM_CHK_NULL_GOTOFINISH_CMERROR(threadSpaceTmp);
        if (threadSpaceTmp->GetNeedSetKernelPointer() && threadSpaceTmp->KernelPointerIsNULL())
        {
            threadSpaceTmp->SetKernelPointer(kernelTmp);
        }
    }

#if _DEBUG
    // Debug-only consistency check across the kernels' thread spaces.
    if( !kernelArrayRT->IntegrityCheckKernelThreadspace() )
    {
        CM_ASSERTMESSAGE("Error: Integrity check for kernel thread space failed.");
        hr = CM_KERNEL_THREADSPACE_INTEGRITY_FAILED;
        goto finish;
    }
#endif

    // Extract the requested task count from the hint bits; more than one
    // means the task is to be split.
    numTasks = ( hints & CM_HINTS_MASK_NUM_TASKS ) >> CM_HINTS_NUM_BITS_TASK_POS;
    if( numTasks > 1 )
    {
        splitTask = true;
    }

    if( m_device->IsPrintEnable() )
    {
        m_device->ClearPrintBuffer();
    }

    // Null-terminated kernel-pointer array reused for each generated task.
    kernels = MOS_NewArray(CmKernelRT*, (count + 1));
    CM_CHK_NULL_GOTOFINISH_CMERROR(kernels);

    do
    {
        for (index = 0; index < count; ++index)
        {
            kernels[ index ] = kernelArrayRT->GetKernelPointer( index );
        }

        kernels[ count ] = nullptr;

        // Mark the final generated task so downstream code can finalize.
        if(splitTask)
        {
            if( numTasksGenerated == (numTasks - 1 ) )
            {
                lastTask = true;
            }
        }
        else
        {
            lastTask = true;
        }

        CM_CHK_CMSTATUS_GOTOFINISH(Enqueue_RT( kernels, eventRT, numTasksGenerated, lastTask, hints, kernelArrayRT->GetPowerOption() ));
        event = eventRT;
        numTasksGenerated++;

    }while(numTasksGenerated < numTasks);

finish:
    MosSafeDeleteArray( kernels );

    return hr;
}
| |
| //*----------------------------------------------------------------------------- |
| //! Enqueue an task, which contains one pre-defined kernel to |
| //! copy from host memory to surface |
| //! This is a non-blocking call. i.e. it returns immediately without waiting for |
| //! GPU to finish the execution of the task. |
| //! A CmEvent is generated each time a task is enqueued. The CmEvent can |
//!     be used to check if the task finishes.
| //! INPUT: |
| //! 1) Pointer to the CmSurface2D_RT as copy destination |
| //! 2) Pointer to the host memory as copy source |
| //! 3) Reference to the pointer to CMEvent |
| //! 4) A boolean value to indicate if or not to flush the queue after enqueue the task |
| //! by default the boolean value is TRUE. |
| //! OUTPUT: |
| //! CM_SUCCESS if the task is successfully enqueued and the CmEvent is generated; |
| //! CM_OUT_OF_HOST_MEMORY if out of host memery; |
| //! CM_FAILURE otherwise. |
| //! More error code is coming. |
| //*----------------------------------------------------------------------------- |
| CM_RT_API int32_t CmQueueRT::EnqueueCopyCPUToGPU( CmSurface2D* surface, const unsigned char* sysMem, CmEvent* & event ) |
| { |
| INSERT_API_CALL_LOG(); |
| |
| if (!m_device->HasGpuCopyKernel()) |
| { |
| return CM_NOT_IMPLEMENTED; |
| } |
| |
| CmSurface2DRT *surfaceRT = static_cast<CmSurface2DRT *>(surface); |
| return EnqueueCopyInternal(surfaceRT, (unsigned char*)sysMem, 0, 0, CM_FASTCOPY_CPU2GPU, CM_FASTCOPY_OPTION_NONBLOCKING, event); |
| } |
| |
| //*----------------------------------------------------------------------------- |
| //! Enqueue an task, which contains one pre-defined kernel to |
| //! copy from surface to host memory |
| //! This is a non-blocking call. i.e. it returns immediately without waiting for |
| //! GPU to finish the execution of the task. |
| //! A CmEvent is generated each time a task is enqueued. The CmEvent can |
| //! be used to check if the task finishs. |
| //! INPUT: |
| //! 1) Pointer to the CmSurface2D_RT as copy source |
| //! 2) Pointer to the host memory as copy destination |
| //! 3) Reference to the pointer to CMEvent |
| //! 4) A boolean value to indicate if or not to flush the queue after enqueue the task |
| //! by default the boolean value is TRUE. |
| //! OUTPUT: |
| //! CM_SUCCESS if the task is successfully enqueued and the CmEvent is generated; |
| //! CM_OUT_OF_HOST_MEMORY if out of host memery; |
| //! CM_FAILURE otherwise. |
| //! More error code is coming. |
| //*----------------------------------------------------------------------------- |
| CM_RT_API int32_t CmQueueRT::EnqueueCopyGPUToCPU( CmSurface2D* surface, unsigned char* sysMem, CmEvent* & event ) |
| { |
| INSERT_API_CALL_LOG(); |
| |
| if (!m_device->HasGpuCopyKernel()) |
| { |
| return CM_NOT_IMPLEMENTED; |
| } |
| |
| CmSurface2DRT *surfaceRT = static_cast<CmSurface2DRT *>(surface); |
| return EnqueueCopyInternal(surfaceRT, sysMem, 0, 0, CM_FASTCOPY_GPU2CPU, CM_FASTCOPY_OPTION_NONBLOCKING, event); |
| } |
| |
| int32_t CmQueueRT::EnqueueUnalignedCopyInternal( CmSurface2DRT* surface, unsigned char* sysMem, const uint32_t widthStride, const uint32_t heightStride, CM_GPUCOPY_DIRECTION direction) |
| { |
| int32_t hr = CM_SUCCESS; |
| uint32_t bufferupSize = 0; |
| uint32_t dstAddShiftOffset = 0; |
| uint32_t threadWidth = 0; |
| uint32_t threadHeight = 0; |
| uint32_t threadNum = 0; |
| uint32_t auxiliaryBufferupSize = 0; |
| uint32_t width = 0; |
| uint32_t height = 0; |
| uint32_t sizePerPixel = 0; |
| uint32_t widthByte = 0; |
| uint32_t copyWidthByte = 0; |
| uint32_t copyHeightRow = 0; |
| uint32_t strideInBytes = widthStride; |
| uint32_t heightStrideInRows = heightStride; |
| size_t linearAddress = (size_t)sysMem; |
| size_t linearAddressAligned = 0; |
| unsigned char* hybridCopyAuxSysMem = nullptr; |
| |
| CmBufferUP *bufferUP = nullptr; |
| CmKernel *kernel = nullptr; |
| CmBufferUP *hybridCopyAuxBufferUP = nullptr; |
| SurfaceIndex *bufferIndexCM = nullptr; |
| SurfaceIndex *hybridCopyAuxIndexCM = nullptr; |
| SurfaceIndex *surf2DIndexCM = nullptr; |
| CmThreadSpace *threadSpace = nullptr; |
| CmTask *gpuCopyTask = nullptr; |
| CmProgram *gpuCopyProgram = nullptr; |
| CmEvent *event = nullptr; |
| CM_STATUS status; |
| CM_SURFACE_FORMAT format; |
| |
| if ( surface ) |
| { |
| CM_CHK_CMSTATUS_GOTOFINISH( surface->GetSurfaceDesc(width, height, format, sizePerPixel)); |
| } |
| else |
| { |
| return CM_FAILURE; |
| } |
| |
| widthByte = width * sizePerPixel; |
| // the actual copy region |
| copyWidthByte = MOS_MIN(strideInBytes, widthByte); |
| copyHeightRow = MOS_MIN(heightStrideInRows, height); |
| |
| if(linearAddress == 0) |
| { |
| CM_ASSERTMESSAGE("Error: Pointer to system memory is null."); |
| return CM_INVALID_ARG_VALUE; |
| } |
| if( (copyWidthByte > CM_MAX_THREADSPACE_WIDTH_FOR_MW * BLOCK_WIDTH ) || ( copyHeightRow > CM_MAX_THREADSPACE_HEIGHT_FOR_MW * BLOCK_HEIGHT) ) |
| { // each thread handles 64x8 block data. This API will fail if it exceeds the max thread space's size |
| CM_ASSERTMESSAGE("Error: Invalid copy size."); |
| return CM_INVALID_ARG_SIZE; |
| } |
| |
| if (sizeof (void *) == 8 ) //64-bit |
| { |
| linearAddressAligned = linearAddress & ADDRESS_PAGE_ALIGNMENT_MASK_X64; |
| } |
| else //32-bit |
| { |
| linearAddressAligned = linearAddress & ADDRESS_PAGE_ALIGNMENT_MASK_X86; |
| } |
| //Calculate Left Shift offset |
| dstAddShiftOffset = (uint32_t)(linearAddress - linearAddressAligned); |
| |
| if (format == CM_SURFACE_FORMAT_NV12 || format == CM_SURFACE_FORMAT_P010 || format == CM_SURFACE_FORMAT_P016) |
| { |
| bufferupSize = MOS_ALIGN_CEIL(strideInBytes * (heightStrideInRows + copyHeightRow * 1/2) + (uint32_t)dstAddShiftOffset , 64); |
| } |
| else |
| { |
| bufferupSize = MOS_ALIGN_CEIL(strideInBytes * heightStrideInRows + (uint32_t)dstAddShiftOffset, 64); |
| } |
| |
| CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateBufferUP(bufferupSize, ( void * )linearAddressAligned, bufferUP)); |
| CM_CHK_CMSTATUS_GOTOFINISH(bufferUP->GetIndex(bufferIndexCM)); |
| CM_CHK_CMSTATUS_GOTOFINISH(surface->GetIndex(surf2DIndexCM)); |
| |
| CM_CHK_CMSTATUS_GOTOFINISH( m_device->LoadPredefinedCopyKernel(gpuCopyProgram)); |
| CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyProgram); |
| |
| if (direction == CM_FASTCOPY_CPU2GPU) |
| { |
| if (format == CM_SURFACE_FORMAT_NV12 || format == CM_SURFACE_FORMAT_P010 || format == CM_SURFACE_FORMAT_P016) |
| { |
| CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(surfaceCopy_write_unaligned_NV12), kernel, "PredefinedGPUCopyKernel")); |
| } |
| else |
| { |
| CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(surfaceCopy_write_unaligned), kernel, "PredefinedGPUCopyKernel")); |
| |
| } |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 0, sizeof( SurfaceIndex ), bufferIndexCM )); |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 1, sizeof( SurfaceIndex ), surf2DIndexCM )); |
| } |
| else |
| { |
| if (format == CM_SURFACE_FORMAT_NV12 || format == CM_SURFACE_FORMAT_P010 || format == CM_SURFACE_FORMAT_P016) |
| { |
| CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(surfaceCopy_read_unaligned_NV12), kernel, "PredefinedGPUCopyKernel")); |
| auxiliaryBufferupSize = BLOCK_WIDTH * 2 * (heightStrideInRows + copyHeightRow * 1/2); |
| } |
| else |
| { |
| CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(surfaceCopy_read_unaligned), kernel, "PredefinedGPUCopyKernel")); |
| auxiliaryBufferupSize = BLOCK_WIDTH * 2 * heightStrideInRows; |
| } |
| hybridCopyAuxSysMem = (unsigned char*)MOS_AlignedAllocMemory(auxiliaryBufferupSize, PAGE_ALIGNED); |
| if(!hybridCopyAuxSysMem) |
| { |
| CM_ASSERTMESSAGE("Error: Out of system memory."); |
| return CM_OUT_OF_HOST_MEMORY; |
| } |
| CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateBufferUP(auxiliaryBufferupSize, (void*)hybridCopyAuxSysMem, hybridCopyAuxBufferUP)); |
| CM_CHK_CMSTATUS_GOTOFINISH(hybridCopyAuxBufferUP->GetIndex(hybridCopyAuxIndexCM)); |
| |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 0, sizeof( SurfaceIndex ), surf2DIndexCM )); |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 1, sizeof( SurfaceIndex ), bufferIndexCM )); |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 5, sizeof( uint32_t ), ©WidthByte )); |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 6, sizeof( SurfaceIndex ), hybridCopyAuxIndexCM )); |
| } |
| |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 2, sizeof( uint32_t ), &strideInBytes )); |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 3, sizeof( uint32_t ), &heightStrideInRows )); |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 4, sizeof( uint32_t ), &dstAddShiftOffset )); |
| |
| threadWidth = ( uint32_t )ceil( ( double )copyWidthByte/BLOCK_WIDTH ); |
| threadHeight = ( uint32_t )ceil( ( double )copyHeightRow/BLOCK_HEIGHT ); |
| |
| threadNum = threadWidth * threadHeight; |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetThreadCount( threadNum )); |
| |
| CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateThreadSpace( threadWidth, threadHeight, threadSpace )); |
| CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateTask(gpuCopyTask)); |
| CM_CHK_CMSTATUS_GOTOFINISH(gpuCopyTask->AddKernel( kernel )); |
| CM_CHK_CMSTATUS_GOTOFINISH(EnqueueFast(gpuCopyTask, event, threadSpace)); |
| |
| if(event) |
| { |
| CM_CHK_CMSTATUS_GOTOFINISH(event->GetStatus(status)); |
| while(status != CM_STATUS_FINISHED) |
| { |
| if (status == CM_STATUS_RESET) |
| { |
| hr = CM_TASK_MEDIA_RESET; |
| goto finish; |
| } |
| CM_CHK_CMSTATUS_GOTOFINISH(event->GetStatus(status)); |
| } |
| } |
| // CPU copy unaligned data |
| if( direction == CM_FASTCOPY_GPU2CPU) |
| { |
| uint32_t readOffset = 0; |
| uint32_t copyLines = 0; |
| unsigned char* startBuffer = (unsigned char*)linearAddressAligned; |
| |
| copyLines = (format == CM_SURFACE_FORMAT_NV12 || format == CM_SURFACE_FORMAT_P010 || format == CM_SURFACE_FORMAT_P016) ? heightStrideInRows + MOS_MIN(heightStrideInRows, height) * 1 / 2 : heightStrideInRows; |
| |
| for(uint32_t i = 0; i < copyLines; ++i) |
| { |
| //copy begining of line |
| size_t beginLineWriteOffset = strideInBytes * i + dstAddShiftOffset; |
| uint32_t mod = ((uintptr_t)startBuffer + beginLineWriteOffset) < BLOCK_WIDTH ? ((uintptr_t)startBuffer + beginLineWriteOffset) : ((uintptr_t)startBuffer + beginLineWriteOffset) & (BLOCK_WIDTH - 1); |
| uint32_t beginLineCopySize = (mod == 0) ? 0:(BLOCK_WIDTH - mod); |
| //fix copy size for cases where the surface width is small |
| if((beginLineCopySize > widthByte) || ( beginLineCopySize == 0 && widthByte < BLOCK_WIDTH ) ) |
| { |
| beginLineCopySize = widthByte; |
| } |
| if(beginLineCopySize > 0) |
| { |
| CmSafeMemCopy((void *)( (unsigned char *)startBuffer + beginLineWriteOffset), (void *)(hybridCopyAuxSysMem + readOffset), beginLineCopySize); |
| } |
| |
| //copy end of line |
| uint32_t alignedWrites = (copyWidthByte - beginLineCopySize) &~ (BLOCK_WIDTH - 1); |
| uint32_t endLineWriteOffset = beginLineWriteOffset + alignedWrites + beginLineCopySize; |
| uint32_t endLineCopySize = dstAddShiftOffset+ i * strideInBytes + copyWidthByte - endLineWriteOffset; |
| if(endLineCopySize > 0 && endLineWriteOffset > beginLineWriteOffset) |
| { |
| CmSafeMemCopy((void *)((unsigned char *)startBuffer + endLineWriteOffset), (void *)(hybridCopyAuxSysMem + readOffset + BLOCK_WIDTH), endLineCopySize); |
| } |
| readOffset += (BLOCK_WIDTH * 2); |
| } |
| } |
| |
| CM_CHK_CMSTATUS_GOTOFINISH(DestroyEventFast(event)); |
| CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyTask(gpuCopyTask)); |
| CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyThreadSpace(threadSpace)); |
| CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyBufferUP(bufferUP)); |
| if (direction == CM_FASTCOPY_GPU2CPU) |
| { |
| if(hybridCopyAuxBufferUP) |
| { |
| CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyBufferUP(hybridCopyAuxBufferUP)); |
| } |
| if(hybridCopyAuxSysMem) |
| { |
| MOS_AlignedFreeMemory(hybridCopyAuxSysMem); |
| hybridCopyAuxSysMem = nullptr; |
| } |
| } |
| finish: |
| if(hr != CM_SUCCESS) |
| { |
| if(bufferUP == nullptr) |
| { |
| // user need to know whether the failure is caused by out of BufferUP. |
| hr = CM_GPUCOPY_OUT_OF_RESOURCE; |
| } |
| |
| if(event) DestroyEventFast(event); |
| if(kernel) m_device->DestroyKernel(kernel); |
| if(threadSpace) m_device->DestroyThreadSpace(threadSpace); |
| if(gpuCopyTask) m_device->DestroyTask(gpuCopyTask); |
| if(bufferUP) m_device->DestroyBufferUP(bufferUP); |
| if(hybridCopyAuxBufferUP) m_device->DestroyBufferUP(hybridCopyAuxBufferUP); |
| if(hybridCopyAuxSysMem) {MOS_AlignedFreeMemory(hybridCopyAuxSysMem); hybridCopyAuxSysMem = nullptr;} |
| } |
| |
| return hr; |
| } |
| //*----------------------------------------------------------------------------- |
| //! Enqueue an task, which contains one pre-defined kernel to |
| //! copy from surface to host memory or from host memory to surface |
| //! This is a non-blocking call. i.e. it returns immediately without waiting for |
| //! GPU to finish the execution of the task. |
| //! A CmEvent is generated each time a task is enqueued. The CmEvent can |
| //! be used to check if the task finishes. |
| //! INPUT: |
| //! 1) Pointer to the CmSurface2D |
| //! 2) Pointer to the host memory |
| //! 3) Width stride in bytes, if there is no padding in system memroy, it is set to zero. |
| //! 4) Height stride in row, if there is no padding in system memroy, it is set to zero. |
| //! 4) Copy direction, cpu->gpu (linear->tiled) or gpu->cpu(tiled->linear) |
| //! 5) Reference to the pointer to CMEvent |
| //! OUTPUT: |
| //! CM_SUCCESS if the task is successfully enqueued and the CmEvent is generated; |
| //! CM_OUT_OF_HOST_MEMORY if out of host memery; |
| //! CM_FAILURE otherwise. |
| //*----------------------------------------------------------------------------- |
| int32_t CmQueueRT::EnqueueCopyInternal(CmSurface2DRT* surface, |
| unsigned char* sysMem, |
| const uint32_t widthStride, |
| const uint32_t heightStride, |
| CM_GPUCOPY_DIRECTION direction, |
| const uint32_t option, |
| CmEvent* & event) |
| { |
| int32_t hr = CM_FAILURE; |
| uint32_t width = 0; |
| uint32_t height = 0; |
| uint32_t sizePerPixel = 0; |
| CM_SURFACE_FORMAT format = CM_SURFACE_FORMAT_INVALID; |
| |
| if (surface) |
| { |
| CM_CHK_CMSTATUS_GOTOFINISH(surface->GetSurfaceDesc(width, height, format, sizePerPixel)); |
| } |
| else |
| { |
| return CM_GPUCOPY_INVALID_SURFACES; |
| } |
| |
| if (format == CM_SURFACE_FORMAT_NV12 || format == CM_SURFACE_FORMAT_P010 || format == CM_SURFACE_FORMAT_P016) |
| { |
| hr = EnqueueCopyInternal_2Planes(surface, (unsigned char*)sysMem, format, width, widthStride, height, heightStride, sizePerPixel, direction, option, event); |
| } |
| else |
| { |
| hr = EnqueueCopyInternal_1Plane(surface, (unsigned char*)sysMem, format, width, widthStride, height, heightStride, sizePerPixel, direction, option, event); |
| } |
| |
| finish: |
| return hr; |
| } |
| |
//*-----------------------------------------------------------------------------
//! Copy between a single-plane 2D surface and 16-byte-aligned system memory
//! using the cached pre-defined GPUCopy kernels.
//! If the linear buffer (stride * rows) exceeds CM_MAX_1D_SURF_WIDTH, the copy
//! is split into vertical slices: each slice is wrapped in its own BufferUP
//! alias of the system memory and enqueued as a separate task. Only the last
//! task's event is surfaced to the caller (or destroyed when the caller passed
//! CM_NO_EVENT).
//! Strides of 0 default to the surface's width in bytes / height in rows.
//*-----------------------------------------------------------------------------
int32_t CmQueueRT::EnqueueCopyInternal_1Plane(CmSurface2DRT* surface,
                                              unsigned char* sysMem,
                                              CM_SURFACE_FORMAT format,
                                              const uint32_t widthInPixel,
                                              const uint32_t widthStride,
                                              const uint32_t heightInRow,
                                              const uint32_t heightStride,
                                              const uint32_t sizePerPixel,
                                              CM_GPUCOPY_DIRECTION direction,
                                              const uint32_t option,
                                              CmEvent* & event )
{
    int32_t hr = CM_SUCCESS;
    uint32_t tempHeight = heightInRow;
    uint32_t strideInBytes = widthStride;
    uint32_t strideInDwords = 0;
    uint32_t heightStrideInRows = heightStride;
    uint32_t addedShiftLeftOffset = 0;
    size_t linearAddress = (size_t)sysMem;
    size_t linearAddressAligned = 0;

    CmKernel *kernel = nullptr;
    CmBufferUP *cmbufferUP = nullptr;
    SurfaceIndex *bufferIndexCM = nullptr;
    SurfaceIndex *surf2DIndexCM = nullptr;
    CmThreadSpace *threadSpace = nullptr;
    CmTask *gpuCopyTask = nullptr;
    CmEvent *internalEvent = nullptr;

    uint32_t threadWidth = 0;
    uint32_t threadHeight = 0;
    uint32_t threadNum = 0;
    uint32_t widthDword = 0;
    uint32_t widthByte = 0;
    uint32_t copyWidthByte = 0;
    uint32_t copyHeightRow = 0;
    uint32_t sliceCopyHeightRow = 0;
    uint32_t sliceCopyBufferUPSize = 0;
    int32_t totalBufferUPSize = 0;
    uint32_t startX = 0;
    uint32_t startY = 0;
    // true while the whole copy fits in one BufferUP/one task; cleared as soon
    // as slicing kicks in below
    bool blSingleEnqueue = true;
    CM_GPUCOPY_KERNEL *gpuCopyKernelParam = nullptr;

    PCM_HAL_STATE cmHalState = \
        ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;

    widthByte = widthInPixel * sizePerPixel;

    //Align the width regarding stride
    if(strideInBytes == 0)
    {
        strideInBytes = widthByte;
    }

    if(heightStrideInRows == 0)
    {
        heightStrideInRows = heightInRow;
    }

    // the actual copy region
    copyWidthByte = MOS_MIN(strideInBytes, widthByte);
    copyHeightRow = MOS_MIN(heightStrideInRows, heightInRow);

    // Make sure stride and start address of system memory is 16-byte aligned.
    // if no padding in system memory , strideInBytes = widthByte.
    if(strideInBytes & 0xf)
    {
        CM_ASSERTMESSAGE("Error: Stride is not 16-byte aligned.");
        return CM_GPUCOPY_INVALID_STRIDE;
    }
    if((linearAddress & 0xf) || (linearAddress == 0))
    {
        CM_ASSERTMESSAGE("Error: Start address of system memory is not 16-byte aligned.");
        return CM_GPUCOPY_INVALID_SYSMEM;
    }

    //Calculate actual total size of system memory
    totalBufferUPSize = strideInBytes * heightStrideInRows;

    //Check thread space width here
    if( copyWidthByte > CM_MAX_THREADSPACE_WIDTH_FOR_MW * BLOCK_PIXEL_WIDTH *4 )
    { // each thread handles 128x8 block data. This API will fail if it exceeds the max thread space's size
        CM_ASSERTMESSAGE("Error: Invalid copy size.");
        return CM_GPUCOPY_INVALID_SIZE;
    }

    // Slice loop: each iteration wraps the remaining system memory (or a
    // CM_MAX_1D_SURF_WIDTH-bounded slice of it) in a BufferUP, enqueues one
    // GPU copy task for it, then releases the per-slice resources.
    while (totalBufferUPSize > 0)
    {
        // BufferUP must start on a page boundary; round down and pass the
        // in-page byte shift to the kernel as addedShiftLeftOffset.
        if (sizeof (void *) == 8 ) //64-bit
        {
            linearAddressAligned = linearAddress & ADDRESS_PAGE_ALIGNMENT_MASK_X64;
        }
        else //32-bit
        {
            linearAddressAligned = linearAddress & ADDRESS_PAGE_ALIGNMENT_MASK_X86;
        }

        //Calculate Left Shift offset
        addedShiftLeftOffset = (uint32_t)(linearAddress - linearAddressAligned);
        totalBufferUPSize += addedShiftLeftOffset;

        if (totalBufferUPSize > CM_MAX_1D_SURF_WIDTH)
        {
            blSingleEnqueue = false;
            // slice height is truncated to a whole number of per-thread row
            // blocks (BLOCK_HEIGHT * INNER_LOOP rows each)
            sliceCopyHeightRow = ((CM_MAX_1D_SURF_WIDTH - addedShiftLeftOffset)/(strideInBytes*(BLOCK_HEIGHT * INNER_LOOP))) * (BLOCK_HEIGHT * INNER_LOOP);
            sliceCopyBufferUPSize = sliceCopyHeightRow * strideInBytes + addedShiftLeftOffset;
            tempHeight = sliceCopyHeightRow;
        }
        else
        {
            sliceCopyHeightRow = copyHeightRow;
            sliceCopyBufferUPSize = totalBufferUPSize;
            if (!blSingleEnqueue)
            {
                tempHeight = sliceCopyHeightRow;
            }
        }

        //Check thread space height here
        if(sliceCopyHeightRow > CM_MAX_THREADSPACE_HEIGHT_FOR_MW * BLOCK_HEIGHT * INNER_LOOP )
        { // each thread handles 128x8 block data. This API will fail if it exceeds the max thread space's size
            CM_ASSERTMESSAGE("Error: Invalid copy size.");
            return CM_GPUCOPY_INVALID_SIZE;
        }

        kernel = nullptr;
        CM_CHK_CMSTATUS_GOTOFINISH( m_device->CreateBufferUP(  sliceCopyBufferUPSize, ( void * )linearAddressAligned, cmbufferUP ));
        CM_CHK_NULL_GOTOFINISH_CMERROR(cmbufferUP);

        //Configure memory object control for BufferUP to solve the cache-line issue.
        if (cmHalState->cmHalInterface->IsGPUCopySurfaceNoCacheWARequired())
        {
            CM_CHK_CMSTATUS_GOTOFINISH(cmbufferUP->SelectMemoryObjectControlSetting(MEMORY_OBJECT_CONTROL_SKL_NO_LLC_L3));
        }
        // CreateGPUCopyKernel returns a cached kernel matching this
        // size/format/direction, locked until GPUCOPY_KERNEL_UNLOCK below.
        CM_CHK_CMSTATUS_GOTOFINISH(CreateGPUCopyKernel(copyWidthByte, sliceCopyHeightRow, format, direction, gpuCopyKernelParam));
        CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyKernelParam);
        kernel = gpuCopyKernelParam->kernel;

        CM_CHK_NULL_GOTOFINISH_CMERROR(kernel);

        CM_CHK_NULL_GOTOFINISH_CMERROR(cmbufferUP);
        CM_CHK_CMSTATUS_GOTOFINISH(cmbufferUP->GetIndex( bufferIndexCM ));
        CM_CHK_CMSTATUS_GOTOFINISH(surface->GetIndex( surf2DIndexCM ));

        // One thread per 128-byte-wide x (BLOCK_HEIGHT * INNER_LOOP)-row block.
        threadWidth = ( uint32_t )ceil( ( double )copyWidthByte/BLOCK_PIXEL_WIDTH/4 );
        threadHeight = ( uint32_t )ceil( ( double )sliceCopyHeightRow/BLOCK_HEIGHT/INNER_LOOP );
        threadNum = threadWidth * threadHeight;
        CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetThreadCount( threadNum ));
        CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateThreadSpace( threadWidth, threadHeight, threadSpace ));

        if(direction == CM_FASTCOPY_GPU2CPU)
        {
            surface->SetReadSyncFlag(true, this); // GPU -> CPU, set surf2d as read sync flag
        }

        if( direction == CM_FASTCOPY_CPU2GPU)
        {
            if (cmHalState->cmHalInterface->IsSurfaceCompressionWARequired())
            {
                CM_CHK_CMSTATUS_GOTOFINISH(surface->SetCompressionMode(MEMCOMP_DISABLED));
            }
            // arg 0 = source buffer, arg 1 = destination surface
            CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 0, sizeof( SurfaceIndex ), bufferIndexCM) );
            CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 1, sizeof( SurfaceIndex ), surf2DIndexCM ));
        }
        else
        {
            // arg 0 = source surface, arg 1 = destination buffer
            CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 1, sizeof( SurfaceIndex ), bufferIndexCM ));
            CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 0, sizeof( SurfaceIndex ), surf2DIndexCM ));
        }


        widthDword = (uint32_t)ceil((double)widthByte / 4);
        strideInDwords = (uint32_t)ceil((double)strideInBytes / 4);

        CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 2, sizeof( uint32_t ), &strideInDwords ));
        CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 3, sizeof( uint32_t ), &heightStrideInRows ));
        CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 4, sizeof( uint32_t ), &addedShiftLeftOffset ));
        CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 5, sizeof( uint32_t ), &threadHeight ));

        if (direction == CM_FASTCOPY_GPU2CPU)  //GPU-->CPU, read
        {
            CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 6, sizeof( uint32_t ), &widthDword ));
            CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 7, sizeof( uint32_t ), &tempHeight ));
            CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 8, sizeof(uint32_t), &startX));
            CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 9, sizeof(uint32_t), &startY));
        }
        else  //CPU-->GPU, write
        {
            //this only works for the kernel surfaceCopy_write_32x32
            CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 6, sizeof( uint32_t ), &startX ));
            CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 7, sizeof( uint32_t ), &startY ));
        }

        CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateTask(gpuCopyTask));
        CM_CHK_CMSTATUS_GOTOFINISH(gpuCopyTask->AddKernel( kernel ));
        if (option & CM_FASTCOPY_OPTION_DISABLE_TURBO_BOOST)
        {
            // disable turbo
            CM_TASK_CONFIG taskConfig;
            CmSafeMemSet(&taskConfig, 0, sizeof(CM_TASK_CONFIG));
            taskConfig.turboBoostFlag = CM_TURBO_BOOST_DISABLE;
            gpuCopyTask->SetProperty(taskConfig);
        }
        CM_CHK_CMSTATUS_GOTOFINISH(EnqueueFast(gpuCopyTask, internalEvent,
                                               threadSpace));

        // release the cached copy kernel so other copies may reuse it
        GPUCOPY_KERNEL_UNLOCK(gpuCopyKernelParam);

        //update for next slice
        linearAddress += sliceCopyBufferUPSize - addedShiftLeftOffset;
        totalBufferUPSize -= sliceCopyBufferUPSize;
        copyHeightRow -= sliceCopyHeightRow;
        startX = 0;
        startY += sliceCopyHeightRow;

        if(totalBufferUPSize > 0)   //Intermediate event, we don't need it
        {
            CM_CHK_CMSTATUS_GOTOFINISH(DestroyEventFast(internalEvent));
        }
        else //Last one event, need keep or destroy it
        {
            if ((option & CM_FASTCOPY_OPTION_BLOCKING) && (internalEvent))
            {
                CM_CHK_CMSTATUS_GOTOFINISH(internalEvent->WaitForTaskFinished());
            }

            if(event == CM_NO_EVENT)  //User doesn't need CmEvent for this copy
            {
                event = nullptr;
                CM_CHK_CMSTATUS_GOTOFINISH(DestroyEventFast(internalEvent));
            }
            else //User needs this CmEvent
            {
                event = internalEvent;
            }
        }

        CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyTask(gpuCopyTask));
        CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyThreadSpace(threadSpace));
        CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyBufferUP(cmbufferUP));
    }

finish:

    if(hr != CM_SUCCESS)
    {
        if(cmbufferUP == nullptr)
        {
            // user need to know whether the failure is caused by out of BufferUP.
            hr = CM_GPUCOPY_OUT_OF_RESOURCE;
        }

        if(kernel && gpuCopyKernelParam)        GPUCOPY_KERNEL_UNLOCK(gpuCopyKernelParam);
        if(threadSpace)                         m_device->DestroyThreadSpace(threadSpace);
        if(gpuCopyTask)                         m_device->DestroyTask(gpuCopyTask);
        if(cmbufferUP)                          m_device->DestroyBufferUP(cmbufferUP);
        if(internalEvent)                       DestroyEventFast(internalEvent);

        // CM_FAILURE for all the other errors
        // return CM_EXCEED_MAX_TIMEOUT to notify app that gpu reset happens
        if( hr != CM_GPUCOPY_OUT_OF_RESOURCE && hr != CM_EXCEED_MAX_TIMEOUT)
        {
            hr = CM_FAILURE;
        }
    }

    return hr;
}
| |
| int32_t CmQueueRT::EnqueueCopyInternal_2Planes(CmSurface2DRT* surface, |
| unsigned char* sysMem, |
| CM_SURFACE_FORMAT format, |
| const uint32_t widthInPixel, |
| const uint32_t widthStride, |
| const uint32_t heightInRow, |
| const uint32_t heightStride, |
| const uint32_t sizePerPixel, |
| CM_GPUCOPY_DIRECTION direction, |
| const uint32_t option, |
| CmEvent* & event) |
| { |
| int32_t hr = CM_SUCCESS; |
| uint32_t strideInBytes = widthStride; |
| uint32_t strideInDwords = 0; |
| uint32_t heightStrideInRows = heightStride; |
| size_t linearAddressY = 0; |
| size_t linearAddressUV = 0; |
| size_t linearAddressAlignedY = 0; |
| size_t linearAddressAlignedUV = 0; |
| uint32_t addedShiftLeftOffsetY = 0; |
| uint32_t addedShiftLeftOffsetUV = 0; |
| |
| CmKernel *kernel = nullptr; |
| CmBufferUP *cmbufferUPY = nullptr; |
| CmBufferUP *cmbufferUPUV = nullptr; |
| SurfaceIndex *bufferUPIndexY = nullptr; |
| SurfaceIndex *bufferUPIndexUV = nullptr; |
| SurfaceIndex *surf2DIndexCM = nullptr; |
| CmThreadSpace *threadSpace = nullptr; |
| CmTask *gpuCopyTask = nullptr; |
| CmEvent *internalEvent = nullptr; |
| |
| uint32_t threadWidth = 0; |
| uint32_t threadHeight = 0; |
| uint32_t threadNum = 0; |
| uint32_t widthDword = 0; |
| uint32_t widthByte = 0; |
| uint32_t copyWidthByte = 0; |
| uint32_t copyHeightRow = 0; |
| uint32_t bufferUPYSize = 0; |
| uint32_t bufferUPUVSize = 0; |
| |
| CM_GPUCOPY_KERNEL *gpuCopyKernelParam = nullptr; |
| PCM_HAL_STATE cmHalState = \ |
| ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState; |
| |
| widthByte = widthInPixel * sizePerPixel; |
| |
| //Align the width regarding stride |
| if (strideInBytes == 0) |
| { |
| strideInBytes = widthByte; |
| } |
| |
| if (heightStrideInRows == 0) |
| { |
| heightStrideInRows = heightInRow; |
| } |
| |
| // the actual copy region |
| copyWidthByte = MOS_MIN(strideInBytes, widthByte); |
| copyHeightRow = MOS_MIN(heightStrideInRows, heightInRow); |
| |
| // Make sure stride and start address of system memory is 16-byte aligned. |
| // if no padding in system memory , strideInBytes = widthByte. |
| if (strideInBytes & 0xf) |
| { |
| CM_ASSERTMESSAGE("Error: Stride is not 16-byte aligned."); |
| return CM_GPUCOPY_INVALID_STRIDE; |
| } |
| |
| //Check thread space width here |
| if (copyWidthByte > CM_MAX_THREADSPACE_WIDTH_FOR_MW * BLOCK_PIXEL_WIDTH * 4) |
| { // each thread handles 128x8 block data. This API will fail if it exceeds the max thread space's size |
| CM_ASSERTMESSAGE("Error: Invalid copy size."); |
| return CM_GPUCOPY_INVALID_SIZE; |
| } |
| |
| linearAddressY = (size_t)sysMem; |
| linearAddressUV = (size_t)((char*)sysMem + strideInBytes * heightStrideInRows); |
| |
| if ((linearAddressY & 0xf) || (linearAddressY == 0) || (linearAddressAlignedUV & 0xf)) |
| { |
| CM_ASSERTMESSAGE("Error: Start address of system memory is not 16-byte aligned."); |
| return CM_GPUCOPY_INVALID_SYSMEM; |
| } |
| |
| if (sizeof (void *) == 8) //64-bit |
| { |
| linearAddressAlignedY = linearAddressY & ADDRESS_PAGE_ALIGNMENT_MASK_X64; |
| linearAddressAlignedUV = linearAddressUV & ADDRESS_PAGE_ALIGNMENT_MASK_X64; |
| } |
| else //32-bit |
| { |
| linearAddressAlignedY = linearAddressY & ADDRESS_PAGE_ALIGNMENT_MASK_X86; |
| linearAddressAlignedUV = linearAddressUV & ADDRESS_PAGE_ALIGNMENT_MASK_X86; |
| } |
| |
| //Calculate Left Shift offset |
| addedShiftLeftOffsetY = (uint32_t)(linearAddressY - linearAddressAlignedY); |
| addedShiftLeftOffsetUV = (uint32_t)(linearAddressUV - linearAddressAlignedUV); |
| |
| //Calculate actual total size of system memory, assume it's NV12/P010/P016 formats |
| bufferUPYSize = strideInBytes * heightStrideInRows + addedShiftLeftOffsetY; |
| bufferUPUVSize = strideInBytes * copyHeightRow * 1 / 2 + addedShiftLeftOffsetUV; |
| |
| //Check thread space height here |
| if (copyHeightRow > CM_MAX_THREADSPACE_HEIGHT_FOR_MW * BLOCK_HEIGHT * INNER_LOOP) |
| { // each thread handles 128x8 block data. This API will fail if it exceeds the max thread space's size |
| CM_ASSERTMESSAGE("Error: Invalid copy size."); |
| return CM_GPUCOPY_INVALID_SIZE; |
| } |
| |
| kernel = nullptr; |
| CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateBufferUP(bufferUPYSize, (void *)linearAddressAlignedY, cmbufferUPY)); |
| CM_CHK_NULL_GOTOFINISH_CMERROR(cmbufferUPY); |
| CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateBufferUP(bufferUPUVSize, (void *)linearAddressAlignedUV, cmbufferUPUV)); |
| CM_CHK_NULL_GOTOFINISH_CMERROR(cmbufferUPUV); |
| |
| //Configure memory object control for the two BufferUP to solve the same cache-line coherency issue. |
| if (cmHalState->cmHalInterface->IsGPUCopySurfaceNoCacheWARequired()) |
| { |
| CM_CHK_CMSTATUS_GOTOFINISH(cmbufferUPY->SelectMemoryObjectControlSetting(MEMORY_OBJECT_CONTROL_SKL_NO_LLC_L3)); |
| CM_CHK_CMSTATUS_GOTOFINISH(cmbufferUPUV->SelectMemoryObjectControlSetting(MEMORY_OBJECT_CONTROL_SKL_NO_LLC_L3)); |
| } |
| else |
| { |
| CM_CHK_CMSTATUS_GOTOFINISH(static_cast< CmBuffer_RT* >(cmbufferUPY)->SetMemoryObjectControl(MEMORY_OBJECT_CONTROL_FROM_GTT_ENTRY, CM_WRITE_THROUGH, 0)); |
| CM_CHK_CMSTATUS_GOTOFINISH(static_cast< CmBuffer_RT* >(cmbufferUPUV)->SetMemoryObjectControl(MEMORY_OBJECT_CONTROL_FROM_GTT_ENTRY, CM_WRITE_THROUGH, 0)); |
| } |
| |
| CM_CHK_CMSTATUS_GOTOFINISH(CreateGPUCopyKernel(copyWidthByte, copyHeightRow, format, direction, gpuCopyKernelParam)); |
| CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyKernelParam); |
| kernel = gpuCopyKernelParam->kernel; |
| |
| CM_CHK_NULL_GOTOFINISH_CMERROR(kernel); |
| |
| CM_CHK_NULL_GOTOFINISH_CMERROR(cmbufferUPY); |
| CM_CHK_NULL_GOTOFINISH_CMERROR(cmbufferUPUV); |
| CM_CHK_CMSTATUS_GOTOFINISH(cmbufferUPY->GetIndex(bufferUPIndexY)); |
| CM_CHK_CMSTATUS_GOTOFINISH(cmbufferUPUV->GetIndex(bufferUPIndexUV)); |
| CM_CHK_CMSTATUS_GOTOFINISH(surface->GetIndex(surf2DIndexCM)); |
| |
| threadWidth = (uint32_t)ceil((double)copyWidthByte / BLOCK_PIXEL_WIDTH / 4); |
| threadHeight = (uint32_t)ceil((double)copyHeightRow / BLOCK_HEIGHT / INNER_LOOP); |
| threadNum = threadWidth * threadHeight; |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetThreadCount(threadNum)); |
| CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateThreadSpace(threadWidth, threadHeight, threadSpace)); |
| |
| widthDword = (uint32_t)ceil((double)widthByte / 4); |
| strideInDwords = (uint32_t)ceil((double)strideInBytes / 4); |
| |
| if (direction == CM_FASTCOPY_CPU2GPU) //Write |
| { |
| //Input BufferUP_Y and BufferUP_UV |
| if (cmHalState->cmHalInterface->IsSurfaceCompressionWARequired()) |
| { |
| CM_CHK_CMSTATUS_GOTOFINISH(surface->SetCompressionMode(MEMCOMP_DISABLED)); |
| } |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(0, sizeof(SurfaceIndex), bufferUPIndexY)); |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(1, sizeof(SurfaceIndex), bufferUPIndexUV)); |
| //Output Surface2D |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(2, sizeof(SurfaceIndex), surf2DIndexCM)); |
| //Other parameters |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(3, sizeof(uint32_t), &strideInDwords)); |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(4, sizeof(uint32_t), &heightStrideInRows)); |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(5, sizeof(uint32_t), &addedShiftLeftOffsetY)); |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(6, sizeof(uint32_t), &addedShiftLeftOffsetUV)); |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(7, sizeof(uint32_t), &threadHeight)); |
| } |
| else //Read |
| { |
| //Input Surface2D |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(0, sizeof(SurfaceIndex), surf2DIndexCM)); |
| //Output BufferUP_Y and BufferUP_UV |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(1, sizeof(SurfaceIndex), bufferUPIndexY)); |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(2, sizeof(SurfaceIndex), bufferUPIndexUV)); |
| //Other parameters |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(3, sizeof(uint32_t), &strideInDwords)); |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(4, sizeof(uint32_t), &heightStrideInRows)); |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(5, sizeof(uint32_t), &addedShiftLeftOffsetY)); |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(6, sizeof(uint32_t), &addedShiftLeftOffsetUV)); |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(7, sizeof(uint32_t), &threadHeight)); |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(8, sizeof(uint32_t), &widthDword)); |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(9, sizeof(uint32_t), &heightInRow)); |
| |
| surface->SetReadSyncFlag(true, this); // GPU -> CPU, set surf2d as read sync flag |
| } |
| |
| CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateTask(gpuCopyTask)); |
| CM_CHK_CMSTATUS_GOTOFINISH(gpuCopyTask->AddKernel(kernel)); |
| if (option & CM_FASTCOPY_OPTION_DISABLE_TURBO_BOOST) |
| { |
| // disable turbo |
| CM_TASK_CONFIG taskConfig; |
| CmSafeMemSet(&taskConfig, 0, sizeof(CM_TASK_CONFIG)); |
| taskConfig.turboBoostFlag = CM_TURBO_BOOST_DISABLE; |
| gpuCopyTask->SetProperty(taskConfig); |
| } |
| CM_CHK_CMSTATUS_GOTOFINISH(EnqueueFast(gpuCopyTask, internalEvent, |
| threadSpace)); |
| |
| GPUCOPY_KERNEL_UNLOCK(gpuCopyKernelParam); |
| |
| if ((option & CM_FASTCOPY_OPTION_BLOCKING) && (internalEvent)) |
| { |
| CM_CHK_CMSTATUS_GOTOFINISH(internalEvent->WaitForTaskFinished()); |
| } |
| |
| if (event == CM_NO_EVENT) //User doesn't need CmEvent for this copy |
| { |
| event = nullptr; |
| CM_CHK_CMSTATUS_GOTOFINISH(DestroyEventFast(internalEvent)); |
| } |
| else //User needs this CmEvent |
| { |
| event = internalEvent; |
| } |
| |
| CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyTask(gpuCopyTask)); |
| CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyThreadSpace(threadSpace)); |
| CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyBufferUP(cmbufferUPY)); |
| CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyBufferUP(cmbufferUPUV)); |
| |
| finish: |
| |
| if (hr != CM_SUCCESS) |
| { |
| if ((cmbufferUPY == nullptr) || (cmbufferUPUV == nullptr)) |
| { |
| // user need to know whether the failure is caused by out of BufferUP. |
| hr = CM_GPUCOPY_OUT_OF_RESOURCE; |
| } |
| |
| if (kernel && gpuCopyKernelParam) GPUCOPY_KERNEL_UNLOCK(gpuCopyKernelParam); |
| if (threadSpace) m_device->DestroyThreadSpace(threadSpace); |
| if (gpuCopyTask) m_device->DestroyTask(gpuCopyTask); |
| if (cmbufferUPY) m_device->DestroyBufferUP(cmbufferUPY); |
| if (cmbufferUPUV) m_device->DestroyBufferUP(cmbufferUPUV); |
| if (internalEvent) DestroyEventFast(internalEvent); |
| |
| // CM_FAILURE for all the other errors |
| // return CM_EXCEED_MAX_TIMEOUT to notify app that gpu reset happens |
| if( hr != CM_GPUCOPY_OUT_OF_RESOURCE && hr != CM_EXCEED_MAX_TIMEOUT) |
| { |
| hr = CM_FAILURE; |
| } |
| } |
| |
| return hr; |
| } |
| |
| //*----------------------------------------------------------------------------- |
| //! Enqueue an task, which contains one pre-defined kernel to copy from video memory to video memory |
| //! This is a non-blocking call. i.e. it returns immediately without waiting for |
| //! GPU to finish the execution of the task. |
| //! A CmEvent is generated each time a task is enqueued. The CmEvent can |
| //! be used to check if the task finishes. |
| //! INPUT: |
| //! 1) Pointer to the CmSurface2D as copy destination |
| //! 2) Pointer to the CmSurface2D as copy source |
| //! 3) Option passed from user, blocking copy, non-blocking copy or disable turbo boost |
| //! 4) Reference to the pointer to CMEvent |
| //! OUTPUT: |
| //! CM_SUCCESS if the task is successfully enqueued and the CmEvent is generated; |
| //! CM_OUT_OF_HOST_MEMORY if out of host memery; |
| //! CM_GPUCOPY_INVALID_SURFACES if input/output surfaces' width/format are different or |
| //! input surface's height is larger than output surface's |
| //! Restrictions: |
| //! 1) Surface's width should be 64-byte aligned. |
| //! 2) The input surface's width/height/format should be the same as output surface's. |
| //*----------------------------------------------------------------------------- |
| CM_RT_API int32_t CmQueueRT::EnqueueCopyGPUToGPU( CmSurface2D* outputSurface, CmSurface2D* inputSurface, uint32_t option, CmEvent* & event ) |
| { |
| INSERT_API_CALL_LOG(); |
| |
| if (!m_device->HasGpuCopyKernel()) |
| { |
| return CM_NOT_IMPLEMENTED; |
| } |
| |
| uint32_t srcSurfaceWidth = 0; |
| uint32_t srcSurfaceHeight = 0; |
| uint32_t dstSurfaceWidth = 0; |
| uint32_t dstSurfaceHeight = 0; |
| |
| CM_SURFACE_FORMAT srcSurfaceFormat = CM_SURFACE_FORMAT_INVALID; |
| CM_SURFACE_FORMAT dstSurfaceFormat = CM_SURFACE_FORMAT_INVALID; |
| |
| int32_t hr = CM_SUCCESS; |
| uint32_t srcSizePerPixel = 0; |
| uint32_t dstSizePerPixel = 0; |
| uint32_t threadWidth = 0; |
| uint32_t threadHeight = 0; |
| |
| CmKernel *kernel = nullptr; |
| SurfaceIndex *surfaceInputIndex = nullptr; |
| SurfaceIndex *surfaceOutputIndex = nullptr; |
| CmThreadSpace *threadSpace = nullptr; |
| CmTask *task = nullptr; |
| uint32_t srcSurfAlignedWidthInBytes = 0; |
| CM_GPUCOPY_KERNEL *gpuCopyKernelParam = nullptr; |
| |
| if ((outputSurface == nullptr) || (inputSurface == nullptr)) |
| { |
| CM_ASSERTMESSAGE("Error: Pointer to input surface or output surface is null."); |
| return CM_FAILURE; |
| } |
| |
| PCM_HAL_STATE cmHalState = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState; |
| CmSurface2DRT *outputSurfaceRT = static_cast<CmSurface2DRT *>(outputSurface); |
| CmSurface2DRT *inputSurfaceRT = static_cast<CmSurface2DRT *>(inputSurface); |
| if (cmHalState->cmHalInterface->IsSurfaceCompressionWARequired()) |
| { |
| CM_CHK_CMSTATUS_GOTOFINISH(outputSurfaceRT->SetCompressionMode(MEMCOMP_DISABLED)); |
| } |
| |
| CM_CHK_CMSTATUS_GOTOFINISH(outputSurfaceRT->GetSurfaceDesc(dstSurfaceWidth, dstSurfaceHeight, dstSurfaceFormat, dstSizePerPixel)); |
| CM_CHK_CMSTATUS_GOTOFINISH(inputSurfaceRT->GetSurfaceDesc(srcSurfaceWidth, srcSurfaceHeight, srcSurfaceFormat, srcSizePerPixel)); |
| |
| if ((dstSurfaceWidth != srcSurfaceWidth) || |
| (dstSurfaceHeight < srcSurfaceHeight) || //relax the restriction |
| (dstSizePerPixel != srcSizePerPixel)) |
| { |
| CM_ASSERTMESSAGE("Error: Size of dest surface does not match src surface."); |
| return CM_GPUCOPY_INVALID_SURFACES; |
| } |
| |
| //To support copy b/w Format_A8R8G8B8 and Format_A8B8G8R8 |
| if (dstSurfaceFormat != srcSurfaceFormat) |
| { |
| if (!((dstSurfaceFormat == CM_SURFACE_FORMAT_A8R8G8B8) && (srcSurfaceFormat == CM_SURFACE_FORMAT_A8B8G8R8)) && |
| !((dstSurfaceFormat == CM_SURFACE_FORMAT_A8R8G8B8) && (srcSurfaceFormat == CM_SURFACE_FORMAT_A8B8G8R8))) |
| { |
| CM_ASSERTMESSAGE("Error: Only support copy b/w Format_A8R8G8B8 and Format_A8B8G8R8 if src format is not matched with dst format."); |
| return CM_GPUCOPY_INVALID_SURFACES; |
| } |
| } |
| |
| // 128Bytes aligned |
| srcSurfAlignedWidthInBytes = (uint32_t)(ceil((double)srcSurfaceWidth*srcSizePerPixel / BLOCK_PIXEL_WIDTH / 4) * (BLOCK_PIXEL_WIDTH * 4)); |
| |
| if (srcSurfaceHeight > CM_MAX_THREADSPACE_WIDTH_FOR_MW *BLOCK_HEIGHT *INNER_LOOP) |
| { |
| CM_ASSERTMESSAGE("Error: Invalid copy size."); |
| return CM_GPUCOPY_INVALID_SIZE; |
| } |
| |
| CM_CHK_CMSTATUS_GOTOFINISH(CreateGPUCopyKernel(srcSurfaceWidth*srcSizePerPixel, srcSurfaceHeight, srcSurfaceFormat, CM_FASTCOPY_GPU2GPU, gpuCopyKernelParam)); |
| CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyKernelParam); |
| |
| CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyKernelParam->kernel); |
| kernel = gpuCopyKernelParam->kernel; |
| |
| CM_CHK_CMSTATUS_GOTOFINISH(inputSurface->GetIndex(surfaceInputIndex)); |
| CM_CHK_CMSTATUS_GOTOFINISH(outputSurface->GetIndex(surfaceOutputIndex)); |
| |
| threadWidth = srcSurfAlignedWidthInBytes / (BLOCK_PIXEL_WIDTH * 4); |
| threadHeight = (uint32_t)ceil((double)srcSurfaceHeight / BLOCK_HEIGHT / INNER_LOOP); |
| |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetThreadCount(threadWidth * threadHeight)); |
| |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(0, sizeof(SurfaceIndex), surfaceInputIndex)); |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(1, sizeof(SurfaceIndex), surfaceOutputIndex)); |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(2, sizeof(uint32_t), &threadHeight)); |
| |
| CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateThreadSpace(threadWidth, threadHeight, threadSpace)); |
| |
| CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateTask(task)); |
| CM_CHK_NULL_GOTOFINISH_CMERROR(task); |
| CM_CHK_CMSTATUS_GOTOFINISH(task->AddKernel(kernel)); |
| |
| if (option & CM_FASTCOPY_OPTION_DISABLE_TURBO_BOOST) |
| { |
| // disable turbo |
| CM_TASK_CONFIG taskConfig; |
| CmSafeMemSet(&taskConfig, 0, sizeof(CM_TASK_CONFIG)); |
| taskConfig.turboBoostFlag = CM_TURBO_BOOST_DISABLE; |
| task->SetProperty(taskConfig); |
| } |
| |
| CM_CHK_CMSTATUS_GOTOFINISH(EnqueueFast(task, event, threadSpace)); |
| if ((option & CM_FASTCOPY_OPTION_BLOCKING) && (event)) |
| { |
| CM_CHK_CMSTATUS_GOTOFINISH(event->WaitForTaskFinished()); |
| } |
| |
| finish: |
| |
| if (kernel && gpuCopyKernelParam) GPUCOPY_KERNEL_UNLOCK(gpuCopyKernelParam); |
| if (threadSpace) m_device->DestroyThreadSpace(threadSpace); |
| if (task) m_device->DestroyTask(task); |
| |
| return hr; |
| } |
| |
| //*----------------------------------------------------------------------------- |
| //! Enqueue an task, which contains one pre-defined kernel to copy from system memory to system memory |
| //! This is a non-blocking call. i.e. it returns immediately without waiting for |
| //! GPU to finish the execution of the task. |
| //! A CmEvent is generated each time a task is enqueued. The CmEvent can be used to check if the task finishs. |
| //! If the size is less than 1KB, CPU is used to do the copy and event will be set as nullptr . |
| //! |
| //! INPUT: |
| //! 1) Pointer to the system memory as copy destination |
| //! 2) Pointer to the system memory as copy source |
| //! 3) The size in bytes of memory be copied. |
| //! 4) Option passed from user, blocking copy, non-blocking copy or disable turbo boost |
| //! 5) Reference to the pointer to CMEvent |
| //! OUTPUT: |
| //! CM_SUCCESS if the task is successfully enqueued and the CmEvent is generated; |
| //! CM_OUT_OF_HOST_MEMORY if out of host memery; |
| //! CM_GPUCOPY_INVALID_SYSMEM if the sysMem is not 16-byte aligned or is NULL. |
| //! CM_GPUCOPY_OUT_OF_RESOURCE if runtime run out of BufferUP. |
| //! CM_GPUCOPY_INVALID_SIZE if its size plus shift-left offset large than CM_MAX_1D_SURF_WIDTH. |
| //! Restrictions: |
| //! 1) dstSysMem and srcSysMem should be 16-byte aligned. |
| //*----------------------------------------------------------------------------- |
| CM_RT_API int32_t CmQueueRT::EnqueueCopyCPUToCPU( unsigned char* dstSysMem, unsigned char* srcSysMem, uint32_t size, uint32_t option, CmEvent* & event ) |
| { |
| INSERT_API_CALL_LOG(); |
| |
| if (!m_device->HasGpuCopyKernel()) |
| { |
| return CM_NOT_IMPLEMENTED; |
| } |
| |
| int hr = CM_SUCCESS; |
| size_t inputLinearAddress = (size_t )srcSysMem; |
| size_t outputLinearAddress = (size_t )dstSysMem; |
| |
| size_t inputLinearAddressAligned = 0; |
| size_t outputLinearAddressAligned = 0; |
| |
| CmBufferUP *surfaceInput = nullptr; |
| CmBufferUP *surfaceOutput = nullptr; |
| CmKernel *kernel = nullptr; |
| SurfaceIndex *surfaceInputIndex = nullptr; |
| SurfaceIndex *surfaceOutputIndex = nullptr; |
| CmThreadSpace *threadSpace = nullptr; |
| CmTask *task = nullptr; |
| |
| int32_t srcLeftShiftOffset = 0; |
| int32_t dstLeftShiftOffset = 0; |
| uint32_t threadWidth = 0; |
| uint32_t threadHeight = 0; |
| uint32_t threadNum = 0; |
| uint32_t gpuMemcopySize = 0; |
| uint32_t cpuMemcopySize = 0; |
| CM_GPUCOPY_KERNEL *gpuCopyKernelParam = nullptr; |
| |
| if((inputLinearAddress & 0xf) || (outputLinearAddress & 0xf) || |
| (inputLinearAddress == 0) || (outputLinearAddress == 0)) |
| { |
| CM_ASSERTMESSAGE("Error: Start address of system memory is not 16-byte aligned."); |
| return CM_GPUCOPY_INVALID_SYSMEM; |
| } |
| |
| // Get page aligned address |
| if (sizeof (void *) == 8 ) //64-bit |
| { |
| inputLinearAddressAligned = inputLinearAddress & ADDRESS_PAGE_ALIGNMENT_MASK_X64; // make sure the address page aligned. |
| outputLinearAddressAligned = outputLinearAddress & ADDRESS_PAGE_ALIGNMENT_MASK_X64; // make sure the address page aligned. |
| } |
| else |
| { |
| inputLinearAddressAligned = inputLinearAddress & ADDRESS_PAGE_ALIGNMENT_MASK_X86; // make sure the address page aligned. |
| outputLinearAddressAligned = outputLinearAddress & ADDRESS_PAGE_ALIGNMENT_MASK_X86; // make sure the address page aligned. |
| } |
| |
| srcLeftShiftOffset = (int32_t)(inputLinearAddress - inputLinearAddressAligned) ; |
| dstLeftShiftOffset = (int32_t)(outputLinearAddress - outputLinearAddressAligned) ; |
| |
| if(((size + srcLeftShiftOffset) > CM_MAX_1D_SURF_WIDTH)|| |
| ((size + dstLeftShiftOffset) > CM_MAX_1D_SURF_WIDTH)) |
| { |
| CM_ASSERTMESSAGE("Error: Invalid copy size."); |
| return CM_GPUCOPY_INVALID_SIZE; |
| } |
| |
| threadWidth = 0; |
| threadHeight = 0; |
| threadNum = size / BYTE_COPY_ONE_THREAD; // each thread copys 32 x 4 x32 bytes = 1K |
| |
| if( threadNum == 0) |
| { |
| //if the size of data is less than data copied per thread ( 4K), use CPU to copy it instead of GPU. |
| CmFastMemCopy((void *)(outputLinearAddress), |
| (void *)(inputLinearAddress), |
| size); //SSE copy used in CMRT. |
| |
| event = nullptr; |
| return CM_SUCCESS; |
| } |
| |
| //Calculate proper thread space's width and height |
| threadWidth = 1; |
| threadHeight = threadNum/threadWidth; |
| while((threadHeight > CM_MAX_THREADSPACE_HEIGHT_FOR_MW)) |
| { |
| if(threadWidth > CM_MAX_THREADSPACE_WIDTH_FOR_MW) |
| { |
| hr = CM_GPUCOPY_INVALID_SIZE; // thread number exceed 511*511 |
| goto finish; |
| } |
| else if (threadWidth == 1) |
| { |
| threadWidth = THREAD_SPACE_WIDTH_INCREMENT; // first time, |
| threadHeight = threadNum/threadWidth; |
| } |
| else |
| { |
| threadWidth += THREAD_SPACE_WIDTH_INCREMENT; // increase 8 per iteration |
| threadHeight = threadNum/threadWidth; |
| } |
| } |
| |
| CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateBufferUP(size + srcLeftShiftOffset, (void *)inputLinearAddressAligned,surfaceInput)); |
| |
| CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateBufferUP(size + dstLeftShiftOffset, (void *)outputLinearAddressAligned,surfaceOutput)); |
| |
| CM_CHK_CMSTATUS_GOTOFINISH(CreateGPUCopyKernel(size, 0, CM_SURFACE_FORMAT_INVALID, CM_FASTCOPY_CPU2CPU, gpuCopyKernelParam)); |
| CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyKernelParam); |
| CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyKernelParam->kernel); |
| kernel = gpuCopyKernelParam->kernel; |
| |
| CM_CHK_NULL_GOTOFINISH_CMERROR(surfaceInput); |
| CM_CHK_CMSTATUS_GOTOFINISH(surfaceInput->GetIndex(surfaceInputIndex)); |
| CM_CHK_NULL_GOTOFINISH_CMERROR(surfaceOutput); |
| CM_CHK_CMSTATUS_GOTOFINISH(surfaceOutput->GetIndex(surfaceOutputIndex)); |
| |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetThreadCount(threadWidth * threadHeight)); |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 0, sizeof( SurfaceIndex ), surfaceInputIndex )); |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 1, sizeof( SurfaceIndex ), surfaceOutputIndex )); |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 2, sizeof( int ), &threadWidth )); |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 3, sizeof( int ), &threadHeight )); |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 4, sizeof( int ), &srcLeftShiftOffset )); |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 5, sizeof( int ), &dstLeftShiftOffset )); |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 6, sizeof( int ), &size )); |
| |
| CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateThreadSpace(threadWidth, threadHeight, threadSpace)); |
| |
| CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateTask(task)); |
| CM_CHK_NULL_GOTOFINISH_CMERROR(task); |
| CM_CHK_CMSTATUS_GOTOFINISH(task->AddKernel (kernel)); |
| |
| if (option & CM_FASTCOPY_OPTION_DISABLE_TURBO_BOOST) |
| { |
| // disable turbo |
| CM_TASK_CONFIG taskConfig; |
| CmSafeMemSet(&taskConfig, 0, sizeof(CM_TASK_CONFIG)); |
| taskConfig.turboBoostFlag = CM_TURBO_BOOST_DISABLE; |
| task->SetProperty(taskConfig); |
| } |
| |
| CM_CHK_CMSTATUS_GOTOFINISH(EnqueueFast(task, event, threadSpace)); |
| |
| if ((option & CM_FASTCOPY_OPTION_BLOCKING) && (event)) |
| { |
| CM_CHK_CMSTATUS_GOTOFINISH(event->WaitForTaskFinished()); |
| } |
| |
| //Copy the unaligned part by using CPU |
| gpuMemcopySize = threadHeight * threadWidth *BYTE_COPY_ONE_THREAD; |
| cpuMemcopySize = size - threadHeight * threadWidth *BYTE_COPY_ONE_THREAD; |
| |
| CmFastMemCopy((void *)(outputLinearAddress+gpuMemcopySize), |
| (void *)(inputLinearAddress+gpuMemcopySize), |
| cpuMemcopySize); //SSE copy used in CMRT. |
| |
| CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyThreadSpace(threadSpace)); |
| CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyTask(task)); |
| CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyBufferUP(surfaceOutput)); // ref_cnf to guarantee task finish before BufferUP being really destroy. |
| CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyBufferUP(surfaceInput)); |
| |
| GPUCOPY_KERNEL_UNLOCK(gpuCopyKernelParam); |
| |
| finish: |
| if(hr != CM_SUCCESS) |
| { //Failed |
| if( surfaceInput == nullptr || surfaceOutput == nullptr) |
| { |
| hr = CM_GPUCOPY_OUT_OF_RESOURCE; // user need to know whether the failure is caused by out of BufferUP. |
| } |
| else |
| { |
| hr = CM_FAILURE; |
| } |
| if(surfaceInput) m_device->DestroyBufferUP(surfaceInput); |
| if(surfaceOutput) m_device->DestroyBufferUP(surfaceOutput); |
| if(kernel && gpuCopyKernelParam) GPUCOPY_KERNEL_UNLOCK(gpuCopyKernelParam); |
| if(threadSpace) m_device->DestroyThreadSpace(threadSpace); |
| if(task) m_device->DestroyTask(task); |
| } |
| |
| return hr; |
| } |
| |
| //*---------------------------------------------------------------------------------------- |
| //| Purpose: Pop task from flushed Queue, Update surface state and Destroy the task |
| //| Notes: |
| //*---------------------------------------------------------------------------------------- |
| void CmQueueRT::PopTaskFromFlushedQueue() |
| { |
| CmTaskInternal* topTask = (CmTaskInternal*)m_flushedTasks.Pop(); |
| |
| if ( topTask != nullptr ) |
| { |
| CmEventRT *event = nullptr; |
| topTask->GetTaskEvent( event ); |
| if ( event != nullptr ) |
| { |
| LARGE_INTEGER nTime; |
| if ( !(MOS_QueryPerformanceCounter( (uint64_t*)&nTime.QuadPart )) ) |
| { |
| CM_ASSERTMESSAGE("Error: Query performace counter failure."); |
| } |
| else |
| { |
| event->SetCompleteTime( nTime ); |
| } |
| } |
| |
| #if MDF_SURFACE_CONTENT_DUMP |
| PCM_CONTEXT_DATA cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData(); |
| if (cmData->cmHalState->dumpSurfaceContent) |
| { |
| int32_t taskId = 0; |
| if (event != nullptr) |
| { |
| event->GetTaskDriverId(taskId); |
| } |
| topTask->SurfaceDump(taskId); |
| } |
| #endif |
| |
| CmTaskInternal::Destroy( topTask ); |
| } |
| return; |
| } |
| |
| int32_t CmQueueRT::TouchFlushedTasks( ) |
| { |
| int32_t hr = CM_SUCCESS; |
| |
| if (m_flushedTasks.IsEmpty()) |
| { |
| if (!m_enqueuedTasks.IsEmpty()) |
| { |
| // if FlushedQueue is empty and EnqueuedQueue is not empty |
| // try flush task to FlushedQueue |
| hr = FlushTaskWithoutSync(); |
| if (FAILED(hr)) |
| { |
| return hr; |
| } |
| } |
| else |
| { // no task in flushedQueue and EnqueuedQueue |
| return CM_FAILURE; |
| } |
| } |
| |
| // Flush FlushedQueue |
| hr = QueryFlushedTasks(); |
| |
| return hr; |
| } |
| |
| //*----------------------------------------------------------------------------- |
| //! Flush the queue, i.e. submit all tasks in the queue to execute according |
| //! to their order in the the queue. The queue will be empty after flush, |
| //! This is a non-blocking call. i.e. it returns immediately without waiting for |
| //! GPU to finish the execution of tasks. |
| //! INPUT: |
| //! OUTPUT: |
| //! CM_SUCCESS if all tasks in the queue are submitted |
| //! CM_FAILURE otherwise. |
| //! More error code is coming. |
| //! |
| //*----------------------------------------------------------------------------- |
| int32_t CmQueueRT::QueryFlushedTasks() |
| { |
| int32_t hr = CM_SUCCESS; |
| |
| m_criticalSectionFlushedTask.Acquire(); |
| while( !m_flushedTasks.IsEmpty() ) |
| { |
| CmTaskInternal* task = (CmTaskInternal*)m_flushedTasks.Top(); |
| CM_CHK_NULL_GOTOFINISH_CMERROR(task); |
| |
| CM_STATUS status = CM_STATUS_FLUSHED ; |
| task->GetTaskStatus(status); |
| if( status == CM_STATUS_FINISHED ) |
| { |
| PopTaskFromFlushedQueue(); |
| } |
| else |
| { |
| // media reset |
| if (status == CM_STATUS_RESET) |
| { |
| PCM_CONTEXT_DATA cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData(); |
| |
| // Clear task status table in Cm Hal State |
| int32_t taskId; |
| CmEventRT*pTopTaskEvent; |
| task->GetTaskEvent(pTopTaskEvent); |
| CM_CHK_NULL_GOTOFINISH_CMERROR(pTopTaskEvent); |
| |
| pTopTaskEvent->GetTaskDriverId(taskId); |
| cmData->cmHalState->taskStatusTable[taskId] = CM_INVALID_INDEX; |
| |
| //Pop task and Destroy it |
| PopTaskFromFlushedQueue(); |
| } |
| |
| // It is an in-order queue, if this one hasn't finshed, |
| // the following ones haven't finished either. |
| break; |
| } |
| } |
| |
| finish: |
| m_criticalSectionFlushedTask.Release(); |
| |
| return hr; |
| } |
| |
| //*----------------------------------------------------------------------------- |
| //! This is a blocking call. It will NOT return untill |
| //! all tasks in GPU and all tasks in queue finishes execution. |
| //! It will first flush the queue if the queue is not empty. |
| //! INPUT: |
| //! OUTPUT: |
| //! CM_SUCCESS if all tasks finish execution. |
| //! CM_FAILURE otherwise. |
| //! More error code is coming. |
| //*----------------------------------------------------------------------------- |
| CM_RT_API int32_t CmQueueRT::DestroyEvent( CmEvent* & event ) |
| { |
| |
| CLock Lock(m_criticalSectionEvent); |
| |
| if (event == nullptr) |
| { |
| return CM_FAILURE; |
| } |
| |
| uint32_t index = 0; |
| |
| CmEventRT *eventRT = dynamic_cast<CmEventRT *>(event); |
| if (eventRT == nullptr) |
| { |
| return DestroyEventFast(event); |
| } |
| eventRT->GetIndex(index); |
| CM_ASSERT( m_eventArray.GetElement( index ) == eventRT ); |
| |
| int32_t status = CmEventRT::Destroy( eventRT ); |
| if( status == CM_SUCCESS && eventRT == nullptr) |
| { |
| m_eventArray.SetElement(index, nullptr); |
| } |
| |
| // Should return nullptr to application even the event is not destroyed |
| // since its reference count is not zero |
| event = nullptr; |
| |
| return status; |
| } |
| |
| //*----------------------------------------------------------------------------- |
| //| Purpose: Clean the Queue if its tasks time out |
| //| Returns: Result of the operation. |
| //*----------------------------------------------------------------------------- |
| int32_t CmQueueRT::CleanQueue( ) |
| { |
| |
| int32_t status = CM_SUCCESS; |
| |
| // Maybe not necessary since |
| // it is called by ~CmDevice only |
| // Update: necessary because it calls FlushBlockWithoutSync |
| if( !m_enqueuedTasks.IsEmpty() ) |
| { |
| // If there are tasks not flushed (i.e. not send to driver ) |
| // wait untill all such tasks are flushed |
| FlushTaskWithoutSync( true ); |
| } |
| CM_ASSERT( m_enqueuedTasks.IsEmpty() ); |
| |
| //Used for timeout detection |
| LARGE_INTEGER freq; |
| MOS_QueryPerformanceFrequency((uint64_t*)&freq.QuadPart); |
| LARGE_INTEGER start; |
| MOS_QueryPerformanceCounter((uint64_t*)&start.QuadPart); |
| int64_t timeout = start.QuadPart + (CM_MAX_TIMEOUT * freq.QuadPart * m_flushedTasks.GetCount()); //Count to timeout at |
| |
| while( !m_flushedTasks.IsEmpty() && status != CM_EXCEED_MAX_TIMEOUT ) |
| { |
| QueryFlushedTasks(); |
| |
| LARGE_INTEGER current; |
| MOS_QueryPerformanceCounter((uint64_t*)¤t.QuadPart); |
| if( current.QuadPart > timeout ) |
| status = CM_EXCEED_MAX_TIMEOUT; |
| } |
| |
| return status; |
| } |
| |
//*-----------------------------------------------------------------------------
//| Purpose:    Returns a mutable reference to the options this queue was
//|             created with (e.g. the GPU context it submits to).
//*-----------------------------------------------------------------------------
CM_QUEUE_CREATE_OPTION &CmQueueRT::GetQueueOption()
{
    return m_queueOption;
}
| |
| //*----------------------------------------------------------------------------- |
| //| Purpose: Get the count of task in queue |
| //| Returns: Result of the operation. |
| //*----------------------------------------------------------------------------- |
| int32_t CmQueueRT::GetTaskCount( uint32_t& numTasks ) |
| { |
| numTasks = m_enqueuedTasks.GetCount() + m_flushedTasks.GetCount(); |
| return CM_SUCCESS; |
| } |
| |
| //*----------------------------------------------------------------------------- |
| //| Purpose: Use GPU to init Surface2D |
| //| Returns: result of operation |
| //*----------------------------------------------------------------------------- |
| CM_RT_API int32_t CmQueueRT::EnqueueInitSurface2D( CmSurface2D* surf2D, const uint32_t initValue, CmEvent* &event) |
| { |
| INSERT_API_CALL_LOG(); |
| |
| if (!m_device->HasGpuInitKernel()) |
| { |
| return CM_NOT_IMPLEMENTED; |
| } |
| |
| int32_t hr = CM_SUCCESS; |
| uint32_t width = 0; |
| uint32_t height = 0; |
| uint32_t sizePerPixel = 0; |
| CmProgram *gpuInitKernelProgram = nullptr; |
| CmKernel *kernel = nullptr; |
| SurfaceIndex *outputIndexCM = nullptr; |
| CmThreadSpace *threadSpace = nullptr; |
| CmTask *gpuCopyTask = nullptr; |
| uint32_t threadWidth = 0; |
| uint32_t threadHeight = 0; |
| uint32_t threadNum = 0; |
| CmSurfaceManager* surfaceMgr = nullptr; |
| CM_SURFACE_FORMAT format = CM_SURFACE_FORMAT_INVALID; |
| |
| if(!surf2D) |
| { |
| CM_ASSERTMESSAGE("Error: Pointer to surface 2d is null."); |
| return CM_FAILURE; |
| } |
| CmSurface2DRT *surf2DRT = static_cast<CmSurface2DRT *>(surf2D); |
| |
| CM_CHK_CMSTATUS_GOTOFINISH(m_device->LoadPredefinedInitKernel(gpuInitKernelProgram)); |
| |
| CM_CHK_CMSTATUS_GOTOFINISH(surf2DRT->GetSurfaceDesc(width, height, format,sizePerPixel)); |
| |
| m_device->GetSurfaceManager(surfaceMgr); |
| CM_CHK_NULL_GOTOFINISH_CMERROR(surfaceMgr); |
| |
| if (format == CM_SURFACE_FORMAT_NV12 || format == CM_SURFACE_FORMAT_P010 || format == CM_SURFACE_FORMAT_P016) |
| { |
| CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel( gpuInitKernelProgram, _NAME( surfaceCopy_set_NV12 ), kernel, "PredefinedGPUCopyKernel")); |
| } |
| else |
| { |
| CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel( gpuInitKernelProgram, _NAME( surfaceCopy_set ), kernel, "PredefinedGPUCopyKernel" )); |
| } |
| CM_CHK_NULL_GOTOFINISH_CMERROR(kernel); |
| CM_CHK_CMSTATUS_GOTOFINISH(surf2D->GetIndex( outputIndexCM )); |
| |
| threadWidth = ( uint32_t )ceil( ( double )width*sizePerPixel/BLOCK_PIXEL_WIDTH/4 ); |
| threadHeight = ( uint32_t )ceil( ( double )height/BLOCK_HEIGHT ); |
| threadNum = threadWidth * threadHeight; |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetThreadCount( threadNum )); |
| |
| CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateThreadSpace( threadWidth, threadHeight, threadSpace )); |
| CM_CHK_NULL_GOTOFINISH_CMERROR(threadSpace); |
| |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 0, sizeof( uint32_t ), &initValue )); |
| CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 1, sizeof( SurfaceIndex ), outputIndexCM )); |
| |
| CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateTask(gpuCopyTask)); |
| CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyTask); |
| |
| CM_CHK_CMSTATUS_GOTOFINISH(gpuCopyTask->AddKernel( kernel )); |
| |
| CM_CHK_CMSTATUS_GOTOFINISH(EnqueueFast(gpuCopyTask, event, threadSpace)); |
| |
| finish: |
| |
| if (kernel) m_device->DestroyKernel( kernel ); |
| if (gpuCopyTask) m_device->DestroyTask(gpuCopyTask); |
| if (threadSpace) m_device->DestroyThreadSpace(threadSpace); |
| |
| return hr; |
| } |
| |
| //*----------------------------------------------------------------------------- |
| //! Flush a geneal task to HAL CM layer for execution. |
| //! This is a non-blocking call. i.e. it returs immediately without waiting for |
| //! GPU to finish the execution of tasks. |
| //! INPUT: task -- Pointer to CmTaskInternal object |
| //! OUTPUT: |
| //! CM_SUCCESS if all tasks in the queue are submitted |
| //! CM_FAILURE otherwise. |
| //*----------------------------------------------------------------------------- |
| int32_t CmQueueRT::FlushGeneralTask(CmTaskInternal* task) |
| { |
| CM_RETURN_CODE hr = CM_SUCCESS; |
| CM_HAL_EXEC_TASK_PARAM param; |
| PCM_HAL_KERNEL_PARAM kernelParam = nullptr; |
| CmKernelData* kernelData = nullptr; |
| uint32_t kernelDataSize = 0; |
| PCM_CONTEXT_DATA cmData = nullptr; |
| CmEventRT* event = nullptr; |
| uint32_t totalThreadCount= 0; |
| uint32_t count = 0; |
| PCM_HAL_KERNEL_PARAM tempData = nullptr; |
| uint32_t maxTSWidth = 0; |
| bool hasThreadArg = false; |
| |
| CmSafeMemSet( ¶m, 0, sizeof( CM_HAL_EXEC_TASK_PARAM ) ); |
| |
| //GT-PIN |
| if(m_device->CheckGTPinEnabled()) |
| { |
| CM_CHK_CMSTATUS_GOTOFINISH(task->GetKernelSurfInfo(param.surfEntryInfoArrays)); |
| } |
| |
| task->GetKernelCount( count ); |
| param.numKernels = count; |
| |
| param.kernels = MOS_NewArray(PCM_HAL_KERNEL_PARAM,count); |
| param.kernelSizes = MOS_NewArray(uint32_t,count); |
| param.kernelCurbeOffset = MOS_NewArray(uint32_t,count); |
| param.queueOption = m_queueOption; |
| |
| CM_CHK_NULL_GOTOFINISH(param.kernels, CM_OUT_OF_HOST_MEMORY); |
| CM_CHK_NULL_GOTOFINISH(param.kernelSizes, CM_OUT_OF_HOST_MEMORY); |
| CM_CHK_NULL_GOTOFINISH(param.kernelCurbeOffset, CM_OUT_OF_HOST_MEMORY); |
| |
| for( uint32_t i = 0; i < count; i ++ ) |
| { |
| task->GetKernelData( i, kernelData ); |
| CM_CHK_NULL_GOTOFINISH_CMERROR(kernelData); |
| |
| kernelParam = kernelData->GetHalCmKernelData(); |
| CM_CHK_NULL_GOTOFINISH_CMERROR(kernelParam); |
| |
| hasThreadArg |= kernelParam->perThreadArgExisted; |
| |
| task->GetKernelDataSize( i, kernelDataSize ); |
| if(kernelDataSize == 0) |
| { |
| CM_ASSERTMESSAGE("Error: Invalid kernel data size."); |
| hr = CM_FAILURE; |
| goto finish; |
| } |
| |
| tempData = kernelData->GetHalCmKernelData(); |
| |
| param.kernels[ i ] = tempData; |
| param.kernelSizes[ i ] = kernelDataSize; |
| param.kernelCurbeOffset[ i ] = task->GetKernelCurbeOffset(i); |
| param.globalSurfaceUsed |= tempData->globalSurfaceUsed; |
| param.kernelDebugEnabled |= tempData->kernelDebugEnabled; |
| } |
| |
| /* |
| * Preset the default TS width/height/dependency: |
| * TS width = MOS_MIN(CM_MAX_THREADSPACE_WIDTH, threadcount) |
| * TS height = totalThreadCount/CM_MAX_THREADSPACE_WIDTH + 1 |
| * dependency = CM_NONE_DEPENDENCY |
| * For threadSpace is nullptr case, we will pass the default TS width/height/dependency to driver |
| * For threadSpace is valid case, the TS width/height/dependency will be update according to thread space set by user. |
| */ |
| task->GetTotalThreadCount(totalThreadCount); |
| |
| if (hasThreadArg) |
| { |
| maxTSWidth = CM_MAX_THREADSPACE_WIDTH_FOR_MW + 1; // 512 allowed for media object |
| } |
| else |
| { |
| maxTSWidth = CM_MAX_THREADSPACE_WIDTH_FOR_MW; // 511 for media walker |
| } |
| |
| param.threadSpaceWidth = (totalThreadCount > maxTSWidth) ? maxTSWidth : totalThreadCount; |
| if(totalThreadCount%maxTSWidth) |
| { |
| param.threadSpaceHeight = totalThreadCount/maxTSWidth + 1; |
| } |
| else |
| { |
| param.threadSpaceHeight = totalThreadCount/maxTSWidth; |
| } |
| |
| param.dependencyPattern = CM_NONE_DEPENDENCY; |
| |
| if (task->IsThreadSpaceCreated()) //scoreboard data preparation |
| { |
| if(task->IsThreadCoordinatesExisted()) |
| { |
| param.threadCoordinates = MOS_NewArray(PCM_HAL_SCOREBOARD, count); |
| param.dependencyMasks = MOS_NewArray(PCM_HAL_MASK_AND_RESET, count); |
| |
| CM_CHK_NULL_GOTOFINISH(param.threadCoordinates, CM_OUT_OF_HOST_MEMORY); |
| CM_CHK_NULL_GOTOFINISH(param.dependencyMasks, CM_OUT_OF_HOST_MEMORY); |
| for(uint32_t i=0; i<count; i++) |
| { |
| void *kernelCoordinates = nullptr; |
| void *dependencyMasks = nullptr; |
| task->GetKernelCoordinates(i, kernelCoordinates); |
| task->GetKernelDependencyMasks(i, dependencyMasks); |
| param.threadCoordinates[i] = (PCM_HAL_SCOREBOARD)kernelCoordinates; |
| param.dependencyMasks[i] = (PCM_HAL_MASK_AND_RESET)dependencyMasks; |
| } |
| } |
| else |
| { |
| param.threadCoordinates = nullptr; |
| } |
| |
| task->GetDependencyPattern(param.dependencyPattern); |
| |
| task->GetThreadSpaceSize(param.threadSpaceWidth, param.threadSpaceHeight); |
| |
| task->GetWalkingPattern(param.walkingPattern); |
| |
| if( task->CheckWalkingParametersSet( ) ) |
| { |
| param.walkingParamsValid = 1; |
| CM_CHK_CMSTATUS_GOTOFINISH(task->GetWalkingParameters(param.walkingParams)); |
| } |
| else |
| { |
| param.walkingParamsValid = 0; |
| } |
| |
| if( task->CheckDependencyVectorsSet( ) ) |
| { |
| param.dependencyVectorsValid = 1; |
| CM_CHK_CMSTATUS_GOTOFINISH(task->GetDependencyVectors(param.dependencyVectors)); |
| } |
| else |
| { |
| param.dependencyVectorsValid = 0; |
| } |
| } |
| if (param.threadSpaceWidth == 0) |
| { |
| CM_ASSERTMESSAGE("Error: Invalid thread space."); |
| hr = CM_INVALID_THREAD_SPACE; |
| goto finish; |
| } |
| task->GetColorCountMinusOne(param.colorCountMinusOne); |
| task->GetMediaWalkerGroupSelect(param.mediaWalkerGroupSelect); |
| |
| param.syncBitmap = task->GetSyncBitmap(); |
| param.conditionalEndBitmap = task->GetConditionalEndBitmap(); |
| param.userDefinedMediaState = task->GetMediaStatePtr(); |
| CmSafeMemCopy(param.conditionalEndInfo, task->GetConditionalEndInfo(), sizeof(param.conditionalEndInfo)); |
| CmSafeMemCopy(¶m.taskConfig, task->GetTaskConfig(), sizeof(param.taskConfig)); |
| cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData(); |
| |
| CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(cmData->cmHalState->pfnSetPowerOption(cmData->cmHalState, task->GetPowerOption())); |
| |
| cmData->cmHalState->osInterface->pfnSetGpuContext(cmData->cmHalState->osInterface, (MOS_GPU_CONTEXT)m_queueOption.GPUContext); |
| RegisterSyncEvent(); |
| |
| CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(cmData->cmHalState->pfnExecuteTask(cmData->cmHalState, ¶m)); |
| |
| if( param.taskIdOut < 0 ) |
| { |
| CM_ASSERTMESSAGE("Error: Invalid task ID."); |
| hr = CM_FAILURE; |
| goto finish; |
| } |
| |
| TASK_LOG(task); |
| |
| task->GetTaskEvent( event ); |
| CM_CHK_NULL_GOTOFINISH_CMERROR(event); |
| CM_CHK_CMSTATUS_GOTOFINISH(event->SetTaskDriverId( param.taskIdOut )); |
| CM_CHK_CMSTATUS_GOTOFINISH(event->SetTaskOsData( param.osData )); |
| CM_CHK_CMSTATUS_GOTOFINISH(task->ResetKernelDataStatus()); |
| |
| //GT-PIN |
| if(m_device->CheckGTPinEnabled()) |
| { |
| //No need to clear the SurEntryInfoArrays here. It will be destored by CmInternalTask |
| CM_CHK_CMSTATUS_GOTOFINISH(event->SetSurfaceDetails(param.surfEntryInfoArrays)); |
| } |
| |
| finish: |
| MosSafeDeleteArray( param.kernels ); |
| MosSafeDeleteArray( param.kernelSizes ); |
| MosSafeDeleteArray( param.threadCoordinates); |
| MosSafeDeleteArray( param.dependencyMasks); |
| MosSafeDeleteArray( param.kernelCurbeOffset); |
| |
| return hr; |
| } |
| |
| //*----------------------------------------------------------------------------- |
| //! Flush a thread group based task to HAL CM layer for execution. |
//! This is a non-blocking call. i.e. it returns immediately without waiting for
| //! GPU to finish the execution of tasks. |
| //! INPUT: task -- Pointer to CmTaskInternal object |
| //! OUTPUT: |
| //! CM_SUCCESS if all tasks in the queue are submitted |
| //! CM_FAILURE otherwise. |
| //*----------------------------------------------------------------------------- |
int32_t CmQueueRT::FlushGroupTask(CmTaskInternal* task)
{
    CM_RETURN_CODE hr = CM_SUCCESS;

    CM_HAL_EXEC_TASK_GROUP_PARAM param;
    CmKernelData* kernelData = nullptr;
    uint32_t kernelDataSize = 0;
    uint32_t count = 0;
    PCM_CONTEXT_DATA cmData = nullptr;
    CmEventRT * event = nullptr;
    PCM_HAL_KERNEL_PARAM tempData = nullptr;

    // Every field not explicitly filled below must be zero before the HAL call.
    CmSafeMemSet( &param, 0, sizeof( CM_HAL_EXEC_TASK_GROUP_PARAM ) );

    //GT-PIN
    if(this->m_device->CheckGTPinEnabled())
    {
        CM_CHK_CMSTATUS_GOTOFINISH(task->GetKernelSurfInfo(param.surEntryInfoArrays));
    }

    task->GetKernelCount( count );
    param.numKernels = count;

    // Per-kernel arrays handed to the HAL layer; released at 'finish'.
    param.kernels = MOS_NewArray(PCM_HAL_KERNEL_PARAM, count);
    param.kernelSizes = MOS_NewArray(uint32_t, count);
    param.kernelCurbeOffset = MOS_NewArray(uint32_t, count);
    param.queueOption = m_queueOption;
    // Virtual-engine hints are only forwarded when this queue uses VE.
    param.mosVeHintParams = (m_usingVirtualEngine)? &m_mosVeHintParams: nullptr;

    CmSafeMemCopy(&param.taskConfig, task->GetTaskConfig(), sizeof(param.taskConfig));
    CM_CHK_NULL_GOTOFINISH_CMERROR(param.kernels);
    CM_CHK_NULL_GOTOFINISH_CMERROR(param.kernelSizes);
    CM_CHK_NULL_GOTOFINISH_CMERROR(param.kernelCurbeOffset);

    // Collect each kernel's HAL payload; a zero-sized payload is fatal.
    for( uint32_t i = 0; i < count; i ++ )
    {
        task->GetKernelData( i, kernelData );
        CM_CHK_NULL_GOTOFINISH_CMERROR(kernelData);

        task->GetKernelDataSize( i, kernelDataSize );
        if( kernelDataSize == 0)
        {
            CM_ASSERTMESSAGE("Error: Invalid kernel data size.");
            hr = CM_FAILURE;
            goto finish;
        }

        tempData = kernelData->GetHalCmKernelData( );

        param.kernels[ i ] = tempData;
        param.kernelSizes[ i ] = kernelDataSize;
        param.kernelCurbeOffset [ i ] = task->GetKernelCurbeOffset(i);
        // Sticky flags: set for the task if any one kernel uses them.
        param.globalSurfaceUsed |= tempData->globalSurfaceUsed;
        param.kernelDebugEnabled |= tempData->kernelDebugEnabled;
    }

    // Reject tasks whose shared-local-memory request exceeds the per-group cap
    // (units presumably 1KB given the constant name -- confirm against HAL).
    task->GetSLMSize(param.slmSize);
    if(param.slmSize > MAX_SLM_SIZE_PER_GROUP_IN_1K)
    {
        CM_ASSERTMESSAGE("Error: SLM size exceeds the maximum per group.");
        hr = CM_EXCEED_MAX_SLM_SIZE;
        goto finish;
    }

    if (task->IsThreadGroupSpaceCreated())//thread group size
    {
        task->GetThreadGroupSpaceSize(param.threadSpaceWidth, param.threadSpaceHeight,
                                      param.threadSpaceDepth, param.groupSpaceWidth,
                                      param.groupSpaceHeight, param.groupSpaceDepth);
    }

    param.syncBitmap = task->GetSyncBitmap();
    param.conditionalEndBitmap = task->GetConditionalEndBitmap();
    param.userDefinedMediaState = task->GetMediaStatePtr();
    CmSafeMemCopy(param.conditionalEndInfo, task->GetConditionalEndInfo(), sizeof(param.conditionalEndInfo));
    CmSafeMemCopy(param.krnExecCfg, task->GetKernelExecuteConfig(), sizeof(param.krnExecCfg));

    // Call HAL layer to execute pfnExecuteGroupTask
    cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();

    CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR( cmData->cmHalState->pfnSetPowerOption( cmData->cmHalState, task->GetPowerOption() ) );

    CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(
        ExecuteGroupTask(
            cmData->cmHalState,
            &param,
            static_cast<MOS_GPU_CONTEXT>(m_queueOption.GPUContext)));

    // A negative driver task id means the submission never reached the HW queue.
    if( param.taskIdOut < 0 )
    {
        CM_ASSERTMESSAGE("Error: Invalid task ID.");
        hr = CM_FAILURE;
        goto finish;
    }
    TASK_LOG(task);
    task->GetTaskEvent( event );
    CM_CHK_NULL_GOTOFINISH_CMERROR( event );
    // Bind the driver-side task id / OS fence data to the user-visible event.
    CM_CHK_CMSTATUS_GOTOFINISH(event->SetTaskDriverId( param.taskIdOut ));
    CM_CHK_CMSTATUS_GOTOFINISH(event->SetTaskOsData( param.osData ));
    CM_CHK_CMSTATUS_GOTOFINISH(task->ResetKernelDataStatus());

    //GT-PIN
    if(this->m_device->CheckGTPinEnabled())
    {
        CM_CHK_CMSTATUS_GOTOFINISH(event->SetSurfaceDetails(param.surEntryInfoArrays));
    }

finish:
    MosSafeDeleteArray( param.kernels );
    MosSafeDeleteArray( param.kernelSizes );
    MosSafeDeleteArray( param.kernelCurbeOffset);

    return hr;
}
| |
| //*----------------------------------------------------------------------------- |
| //! Flush a VEBOX task to HAL CM layer for execution. |
//! This is a non-blocking call. i.e. it returns immediately without waiting for
| //! GPU to finish the execution of tasks. |
| //! INPUT: task -- Pointer to CmTaskInternal object |
| //! OUTPUT: |
| //! CM_SUCCESS if all tasks in the queue are submitted |
| //! CM_FAILURE otherwise. |
| //*----------------------------------------------------------------------------- |
| int32_t CmQueueRT::FlushVeboxTask(CmTaskInternal* task) |
| { |
| CM_RETURN_CODE hr = CM_SUCCESS; |
| |
| CM_HAL_EXEC_VEBOX_TASK_PARAM param; |
| PCM_CONTEXT_DATA cmData = nullptr; |
| CmEventRT * event = nullptr; |
| uint8_t *stateData = nullptr; |
| uint8_t *surfaceData = nullptr; |
| CmBuffer_RT * temp = nullptr; |
| |
| CmSafeMemSet( ¶m, 0, sizeof( CM_HAL_EXEC_VEBOX_TASK_PARAM ) ); |
| |
| //Set VEBOX state data pointer and size |
| //Set VEBOX surface data pointer and size |
| CM_VEBOX_STATE cmVeboxState; |
| CmBufferUP *veboxParamBuf = nullptr; |
| CM_VEBOX_SURFACE_DATA cmVeboxSurfaceData; |
| task->GetVeboxState(cmVeboxState); |
| task->GetVeboxParam(veboxParamBuf); |
| task->GetVeboxSurfaceData(cmVeboxSurfaceData); |
| CM_CHK_NULL_GOTOFINISH_CMERROR(veboxParamBuf); |
| |
| temp = static_cast<CmBuffer_RT*>(veboxParamBuf); |
| temp->GetHandle(param.veboxParamIndex); |
| |
| param.cmVeboxState = cmVeboxState; |
| param.veboxParam = veboxParamBuf; |
| |
| param.veboxSurfaceData = cmVeboxSurfaceData; |
| |
| param.queueOption = m_queueOption; |
| |
| //Set VEBOX task id to -1 |
| param.taskIdOut = -1; |
| |
| cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData(); |
| |
| cmData->cmHalState->osInterface->pfnSetGpuContext(cmData->cmHalState->osInterface, (MOS_GPU_CONTEXT)m_queueOption.GPUContext); |
| RegisterSyncEvent(); |
| |
| CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR( cmData->cmHalState->pfnExecuteVeboxTask( cmData->cmHalState, ¶m ) ); |
| |
| if( param.taskIdOut < 0 ) |
| { |
| CM_ASSERTMESSAGE("Error: Invalid task ID."); |
| hr = CM_FAILURE; |
| goto finish; |
| } |
| |
| task->GetTaskEvent( event ); |
| CM_CHK_NULL_GOTOFINISH_CMERROR( event ); |
| CM_CHK_CMSTATUS_GOTOFINISH(event->SetTaskDriverId( param.taskIdOut )); |
| CM_CHK_CMSTATUS_GOTOFINISH(event->SetTaskOsData( param.osData )); |
| |
| finish: |
| return hr; |
| } |
| |
| //*----------------------------------------------------------------------------- |
| //! Flush the queue, i.e. submit all tasks in the queue to execute according |
//! to their order in the queue. The queue will be empty after flush,
| //! This is a non-blocking call. i.e. it returns immediately without waiting for |
| //! GPU to finish the execution of tasks. |
| //! INPUT: |
| //! OUTPUT: |
| //! CM_SUCCESS if all tasks in the queue are submitted |
| //! CM_FAILURE otherwise. |
| //*----------------------------------------------------------------------------- |
int32_t CmQueueRT::FlushEnqueueWithHintsTask( CmTaskInternal* task )
{
    CM_RETURN_CODE hr = CM_SUCCESS;
    CM_HAL_EXEC_HINTS_TASK_PARAM param;
    PCM_CONTEXT_DATA cmData = nullptr;
    CmKernelData* kernelData = nullptr;
    uint32_t kernelDataSize = 0;
    uint32_t count = 0;
    CmEventRT *event = nullptr;
    PCM_HAL_KERNEL_PARAM tempData = nullptr;

    // Every field not explicitly filled below must be zero before the HAL call.
    CmSafeMemSet( &param, 0, sizeof( CM_HAL_EXEC_HINTS_TASK_PARAM ) );

    task->GetKernelCount ( count );
    param.numKernels = count;

    // Per-kernel arrays handed to the HAL layer; released at 'finish'.
    param.kernels = MOS_NewArray(PCM_HAL_KERNEL_PARAM, count);
    param.kernelSizes = MOS_NewArray(uint32_t, count);
    param.kernelCurbeOffset = MOS_NewArray(uint32_t, count);
    param.queueOption = m_queueOption;

    CM_CHK_NULL_GOTOFINISH_CMERROR(param.kernels);
    CM_CHK_NULL_GOTOFINISH_CMERROR(param.kernelSizes);
    CM_CHK_NULL_GOTOFINISH_CMERROR(param.kernelCurbeOffset);

    // Propagate the user-supplied scheduling hints for this task.
    task->GetHints(param.hints);
    task->GetNumTasksGenerated(param.numTasksGenerated);
    task->GetLastTask(param.isLastTask);

    // Collect each kernel's HAL payload; a zero-sized payload is fatal.
    for( uint32_t i = 0; i < count; i ++ )
    {
        task->GetKernelData( i, kernelData );
        CM_CHK_NULL_GOTOFINISH_CMERROR( kernelData );

        task->GetKernelDataSize( i, kernelDataSize );
        if( kernelDataSize == 0 )
        {
            CM_ASSERTMESSAGE("Error: Invalid kernel data size.");
            hr = CM_FAILURE;
            goto finish;
        }

        tempData = kernelData->GetHalCmKernelData();

        param.kernels[ i ] = tempData;
        param.kernelSizes[ i ] = kernelDataSize;
        param.kernelCurbeOffset[ i ] = task->GetKernelCurbeOffset(i);
    }

    param.userDefinedMediaState = task->GetMediaStatePtr();
    cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
    CM_CHK_NULL_GOTOFINISH_CMERROR(cmData);

    CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(cmData->cmHalState->pfnSetPowerOption(cmData->cmHalState, task->GetPowerOption()));

    // Bind this queue's GPU context and register the sync event before submission.
    cmData->cmHalState->osInterface->pfnSetGpuContext(cmData->cmHalState->osInterface, (MOS_GPU_CONTEXT)m_queueOption.GPUContext);
    RegisterSyncEvent();

    CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(cmData->cmHalState->pfnExecuteHintsTask(cmData->cmHalState, &param));

    // A negative driver task id means the submission never reached the HW queue.
    if( param.taskIdOut < 0 )
    {
        CM_ASSERTMESSAGE("Error: Invalid task ID.");
        hr = CM_FAILURE;
        goto finish;
    }

    TASK_LOG(task);

    task->GetTaskEvent( event );
    CM_CHK_NULL_GOTOFINISH_CMERROR( event );
    // Bind the driver-side task id / OS fence data to the user-visible event.
    CM_CHK_CMSTATUS_GOTOFINISH(event->SetTaskDriverId( param.taskIdOut ));
    CM_CHK_CMSTATUS_GOTOFINISH(event->SetTaskOsData( param.osData ));
    CM_CHK_CMSTATUS_GOTOFINISH(task->ResetKernelDataStatus());

finish:

    MosSafeDeleteArray( param.kernels );
    MosSafeDeleteArray( param.kernelSizes );
    MosSafeDeleteArray( param.kernelCurbeOffset );

    return hr;
}
| |
| //*----------------------------------------------------------------------------- |
| //! Flush the queue, i.e. submit all tasks in the queue to execute according |
//! to their order in the queue. The queue will be empty after flush,
//! This is a non-blocking call. i.e. it returns immediately without waiting for
| //! GPU to finish the execution of tasks. |
| //! INPUT: |
| //! OUTPUT: |
| //! CM_SUCCESS if all tasks in the queue are submitted |
| //! CM_FAILURE otherwise. |
| //*----------------------------------------------------------------------------- |
int32_t CmQueueRT::FlushTaskWithoutSync( bool flushBlocked )
{
    int32_t hr = CM_SUCCESS;
    CmTaskInternal* task = nullptr;
    uint32_t taskType = CM_TASK_TYPE_DEFAULT;
    uint32_t freeSurfNum = 0;
    CmSurfaceManager* surfaceMgr = nullptr;
    CSync* surfaceLock = nullptr;

    m_criticalSectionHalExecute.Acquire(); // Enter HalCm Execute Protection

    // Drain the enqueued-task queue, submitting each task to the HAL layer.
    while( !m_enqueuedTasks.IsEmpty() )
    {
        uint32_t flushedTaskCount = m_flushedTasks.GetCount();
        if ( flushBlocked )
        {
            // Blocking mode: spin until a slot opens below the HW task limit.
            while( flushedTaskCount >= m_halMaxValues->maxTasks )
            {
                // If the task count in flushed queue is no less than hw restriction,
                // query the status of flushed task queue. Remove any finished tasks from the queue
                QueryFlushedTasks();
                flushedTaskCount = m_flushedTasks.GetCount();
            }
        }
        else
        {
            // Non-blocking mode: make one attempt to free a slot, else stop flushing.
            if( flushedTaskCount >= m_halMaxValues->maxTasks )
            {
                // If the task count in flushed queue is no less than hw restriction,
                // query the status of flushed task queue. Remove any finished tasks from the queue
                QueryFlushedTasks();
                flushedTaskCount = m_flushedTasks.GetCount();
                if( flushedTaskCount >= m_halMaxValues->maxTasks )
                {
                    // If none of flushed tasks finishes, we can't flush more tasks.
                    break;
                }
            }
        }

        task = (CmTaskInternal*)m_enqueuedTasks.Pop();
        CM_CHK_NULL_GOTOFINISH_CMERROR( task );

        // Let registered observers know this task is being flushed.
        CmNotifierGroup *notifiers = m_device->GetNotifiers();
        if (notifiers != nullptr)
        {
            notifiers->NotifyTaskFlushed(m_device, task);
        }

        task->GetTaskType(taskType);

        // Dispatch to the flush routine matching the task flavor.
        switch(taskType)
        {
            case CM_INTERNAL_TASK_WITH_THREADSPACE:
                hr = FlushGeneralTask(task);
                break;

            case CM_INTERNAL_TASK_WITH_THREADGROUPSPACE:
                hr = FlushGroupTask(task);
                break;

            case CM_INTERNAL_TASK_VEBOX:
                hr = FlushVeboxTask(task);
                break;

            case CM_INTERNAL_TASK_ENQUEUEWITHHINTS:
                hr = FlushEnqueueWithHintsTask(task);
                break;

            default: // by default, assume the task is considered as general task: CM_INTERNAL_TASK_WITH_THREADSPACE
                hr = FlushGeneralTask(task);
                break;
        }

        if(hr == CM_SUCCESS)
        {
            m_flushedTasks.Push( task );
            task->VtuneSetFlushTime(); // Record Flush Time
        }
        else
        {
            // Failed to flush, destroy the task.
            CmTaskInternal::Destroy( task );
        }

    } // loop for task

    QueryFlushedTasks();

finish:
    m_criticalSectionHalExecute.Release();//Leave HalCm Execute Protection

    //Delayed destroy for resource
    m_device->GetSurfaceManager(surfaceMgr);
    if (!surfaceMgr)
    {
        CM_ASSERTMESSAGE("Error: Pointer to surface manager is null.");
        return CM_NULL_POINTER;
    }

    surfaceLock = m_device->GetSurfaceCreationLock();
    if (surfaceLock == nullptr)
    {
        CM_ASSERTMESSAGE("Error: Pointer to surface creation lock is null.");
        return CM_NULL_POINTER;
    }
    surfaceLock->Acquire();
    surfaceMgr->RefreshDelayDestroySurfaces(freeSurfNum);
    surfaceLock->Release();

    return hr;
}
| |
| //*----------------------------------------------------------------------------- |
| //| Purpose: Enqueue a Vebox Task |
| //| Arguments : |
| //| pVebox_G75 [in] Pointer to a CmVebox object |
| //| event [in] Reference to the pointer to Event |
| //| |
| //| Returns: Result of the operation. |
| //*----------------------------------------------------------------------------- |
| CM_RT_API int32_t CmQueueRT::EnqueueVebox(CmVebox * vebox, CmEvent* & event) |
| { |
| INSERT_API_CALL_LOG(); |
| |
| int32_t hr = CM_SUCCESS; |
| CmTaskInternal* task = nullptr; |
| int32_t taskDriverId = -1; |
| bool isEventVisible = (event == CM_NO_EVENT)? false:true; |
| CmEventRT *eventRT = static_cast<CmEventRT *>(event); |
| |
| //Check if the input is valid |
| if ( vebox == nullptr ) |
| { |
| CM_ASSERTMESSAGE("Error: Pointer to vebox is null."); |
| return CM_NULL_POINTER; |
| } |
| CmVeboxRT *veboxRT = static_cast<CmVeboxRT *>(vebox); |
| CM_CHK_CMSTATUS_GOTOFINISH(CmTaskInternal::Create(m_device, veboxRT, task )); |
| |
| LARGE_INTEGER nEnqueueTime; |
| if ( !(MOS_QueryPerformanceCounter( (uint64_t*)&nEnqueueTime.QuadPart )) ) |
| { |
| CM_ASSERTMESSAGE("Error: Query Performance counter failure."); |
| hr = CM_FAILURE; |
| goto finish; |
| } |
| |
| CM_CHK_CMSTATUS_GOTOFINISH(CreateEvent(task, isEventVisible, taskDriverId, eventRT)); |
| |
| if ( eventRT != nullptr ) |
| { |
| eventRT->SetEnqueueTime( nEnqueueTime ); |
| } |
| event = eventRT; |
| |
| if (!m_enqueuedTasks.Push(task)) |
| { |
| CM_ASSERTMESSAGE("Error: Push enqueued tasks failure.") |
| hr = CM_FAILURE; |
| goto finish; |
| } |
| |
| CM_CHK_CMSTATUS_GOTOFINISH(FlushTaskWithoutSync()); |
| |
| finish: |
| if (hr != CM_SUCCESS) |
| { |
| CmTaskInternal::Destroy(task); |
| } |
| return hr; |
| } |
| |
| //*----------------------------------------------------------------------------- |
| //| Purpose: Create Event and Update event in m_eventArray |
| //| Returns: result of operation |
| //*----------------------------------------------------------------------------- |
| int32_t CmQueueRT::CreateEvent(CmTaskInternal *task, bool isVisible, int32_t &taskDriverId, CmEventRT *&event ) |
| { |
| int32_t hr = CM_SUCCESS; |
| |
| m_criticalSectionEvent.Acquire(); |
| |
| uint32_t freeSlotInEventArray = m_eventArray.GetFirstFreeIndex(); |
| |
| hr = CmEventRT::Create( freeSlotInEventArray, this, task, taskDriverId, m_device, isVisible, event ); |
| |
| if (hr == CM_SUCCESS) |
| { |
| |
| m_eventArray.SetElement( freeSlotInEventArray, event ); |
| m_eventCount ++; |
| |
| task->SetTaskEvent( event ); |
| |
| if(!isVisible) |
| { |
| event = nullptr; |
| } |
| |
| } |
| else |
| { |
| CM_ASSERTMESSAGE("Error: Create Event failure.") |
| } |
| |
| m_criticalSectionEvent.Release(); |
| |
| return hr; |
| } |
| |
| //*--------------------------------------------------------------------------------------------------------- |
| //| Name: EnqueueCopyCPUToGPUFullStride() |
| //| Purpose: Copy data from system memory to video memory (surface) |
| //| Arguments: |
| //| surface [in] Pointer to a CmSurface2D object as copy destination |
| //| sysMem [in] Pointer to a system memory as copy source |
| //| widthStride [in] Width stride in bytes for system memory (to calculate start of next line) |
| //| heightStride [in] Width stride in row for system memory (to calculate start of next plane) |
| //| option [in] Option passed from user, blocking copy, non-blocking copy or disable turbo boost |
| //| event [in,out] Reference to the pointer to Event |
| //| Returns: Result of the operation. |
| //| |
| //| Restrictions & Notes: |
| //| 1) sysMem must be 16-byte aligned. |
| //| 2) Surface's width must be 16-byte aligned regarding performance. |
| //| 3) widthStride and heightStride are used to indicate the padding information in system memory |
| //| widthStride = width_in_pixel * bytes_per_pixel + padding_in_bytes |
| //| heightStride = height + padding_in_row |
| //*--------------------------------------------------------------------------------------------------------- |
| CM_RT_API int32_t CmQueueRT::EnqueueCopyCPUToGPUFullStride( CmSurface2D* surface, |
| const unsigned char* sysMem, |
| const uint32_t widthStride, |
| const uint32_t heightStride, |
| const uint32_t option, |
| CmEvent* & event ) |
| { |
| INSERT_API_CALL_LOG(); |
| |
| if (!m_device->HasGpuCopyKernel()) |
| { |
| return CM_NOT_IMPLEMENTED; |
| } |
| |
| CmSurface2DRT *surfaceRT = static_cast<CmSurface2DRT *>(surface); |
| return EnqueueCopyInternal(surfaceRT, (unsigned char*)sysMem, widthStride, heightStride, CM_FASTCOPY_CPU2GPU, option, event); |
| } |
| |
| //*--------------------------------------------------------------------------------------------------------- |
| //| Name: EnqueueCopyGPUToCPUFullStride() |
| //| Purpose: Copy data from tiled video memory (surface) to linear system memory |
| //| Arguments: |
| //| surface [in] Pointer to a CmSurface2D object as copy source |
| //| sysMem [in] Pointer to a system memory as copy destination |
| //| widthStride [in] Width stride in bytes for system memory (to calculate start of next line) |
| //| heightStride [in] Width stride in row for system memory (to calculate start of next plane) |
| //| option [in] Option passed from user, blocking copy,non-blocking copy or disable turbo boost |
| //| event [in,out] Reference to the pointer to Event |
| //| Returns: Result of the operation. |
| //| |
| //| Restrictions & Notes: |
| //| 1) sysMem must be 16-byte aligned. |
| //| 2) Surface's width must be 16-byte aligned regarding performance. |
| //| 3) widthStride and heightStride are used to indicate the padding information in system memory |
| //| widthStride = width_in_pixel * bytes_per_pixel + padding_in_bytes |
| //| heightStride = height + padding_in_row |
| //*--------------------------------------------------------------------------------------------------------- |
| CM_RT_API int32_t CmQueueRT::EnqueueCopyGPUToCPUFullStride( CmSurface2D* surface, |
| unsigned char* sysMem, |
| const uint32_t widthStride, |
| const uint32_t heightStride, |
| const uint32_t option, |
| CmEvent* & event ) |
| { |
| INSERT_API_CALL_LOG(); |
| |
| if (!m_device->HasGpuCopyKernel()) |
| { |
| return CM_NOT_IMPLEMENTED; |
| } |
| |
| CmSurface2DRT *surfaceRT = static_cast<CmSurface2DRT *>(surface); |
| return EnqueueCopyInternal(surfaceRT, sysMem, widthStride, heightStride, CM_FASTCOPY_GPU2CPU, option, event); |
| } |
| |
| //*--------------------------------------------------------------------------------------------------------- |
| //| Name: CreateGPUCopyKernel() |
//| Purpose: Create GPUCopy kernel, reuse the kernel if it has been created and reusable
| //| Arguments: |
| //| widthInByte [in] surface's width in bytes |
| //| height [in] surface's height |
//| format [in] surface's format
| //| copyDirection [in] copy direction, cpu -> gpu or gpu -> cpu |
| //| gpuCopyKernelParam [out] kernel param |
| //| |
| //| Returns: Result of the operation. |
| //| |
| //*--------------------------------------------------------------------------------------------------------- |
int32_t CmQueueRT::CreateGPUCopyKernel(uint32_t widthInByte,
                                       uint32_t height,
                                       CM_SURFACE_FORMAT format,
                                       CM_GPUCOPY_DIRECTION copyDirection,
                                       CM_GPUCOPY_KERNEL* &gpuCopyKernelParam)
{
    int32_t hr = CM_SUCCESS;

    //Search existing kernel
    CM_CHK_CMSTATUS_GOTOFINISH(SearchGPUCopyKernel(widthInByte, height, format, copyDirection, gpuCopyKernelParam));

    if(gpuCopyKernelParam != nullptr)
    { // reuse
        // Found an idle cached kernel of the right type; just mark it busy.
        GPUCOPY_KERNEL_LOCK(gpuCopyKernelParam);
    }
    else
    {
        // No reusable kernel: build a new one, lock it, then cache it for reuse.
        gpuCopyKernelParam = new (std::nothrow) CM_GPUCOPY_KERNEL ;
        CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyKernelParam);
        CmSafeMemSet(gpuCopyKernelParam, 0, sizeof(CM_GPUCOPY_KERNEL));

        CM_CHK_CMSTATUS_GOTOFINISH(AllocateGPUCopyKernel(widthInByte, height, format, copyDirection, gpuCopyKernelParam->kernel));
        CM_CHK_CMSTATUS_GOTOFINISH(GetGPUCopyKrnID(widthInByte, height, format, copyDirection, gpuCopyKernelParam->kernelID));
        GPUCOPY_KERNEL_LOCK(gpuCopyKernelParam);

        CM_CHK_CMSTATUS_GOTOFINISH(AddGPUCopyKernel(gpuCopyKernelParam));
    }

finish:
    if( hr != CM_SUCCESS)
    {
        // On failure nothing is handed back; the partially built param is freed.
        CmSafeDelete(gpuCopyKernelParam);
    }

    return hr;
}
| |
| //*--------------------------------------------------------------------------------------------------------- |
| //| Name: SearchGPUCopyKernel() |
| //| Purpose: Search if the required kernel exists |
| //| Arguments: |
| //| widthInByte [in] surface's width in bytes |
| //| height [in] surface's height |
//| format [in] surface's format
| //| copyDirection [in] copy direction, cpu -> gpu or gpu -> cpu |
| //| gpuCopyKernelParam [out] kernel param |
| //| |
| //| Returns: Result of the operation. |
| //| |
| //*--------------------------------------------------------------------------------------------------------- |
| int32_t CmQueueRT::SearchGPUCopyKernel(uint32_t widthInByte, |
| uint32_t height, |
| CM_SURFACE_FORMAT format, |
| CM_GPUCOPY_DIRECTION copyDirection, |
| CM_GPUCOPY_KERNEL* &kernelParam) |
| { |
| int32_t hr = CM_SUCCESS; |
| CM_GPUCOPY_KERNEL *gpucopyKernel = nullptr; |
| CM_GPUCOPY_KERNEL_ID kernelTypeID = GPU_COPY_KERNEL_UNKNOWN; |
| |
| kernelParam = nullptr; |
| CM_CHK_CMSTATUS_GOTOFINISH(GetGPUCopyKrnID(widthInByte, height, format, copyDirection, kernelTypeID)); |
| |
| for(uint32_t index =0 ; index< m_copyKernelParamArrayCount; index++) |
| { |
| gpucopyKernel = (CM_GPUCOPY_KERNEL*)m_copyKernelParamArray.GetElement(index); |
| if(gpucopyKernel != nullptr) |
| { |
| if(!gpucopyKernel->locked && |
| gpucopyKernel->kernelID == kernelTypeID) |
| { |
| kernelParam = gpucopyKernel; |
| break; |
| } |
| } |
| } |
| |
| finish: |
| return hr; |
| } |
| |
| //*--------------------------------------------------------------------------------------------------------- |
| //| Name: AddGPUCopyKernel() |
| //| Purpose: Add new kernel into m_copyKernelParamArray |
| //| Arguments: |
| //| widthInByte [in] surface's width in bytes |
| //| height [in] surface's height |
//| format [in] surface's format
| //| copyDirection [in] copy direction, cpu -> gpu or gpu -> cpu |
| //| gpuCopyKernelParam [out] kernel param |
| //| |
| //| Returns: Result of the operation. |
| //| |
| //*--------------------------------------------------------------------------------------------------------- |
int32_t CmQueueRT::AddGPUCopyKernel(CM_GPUCOPY_KERNEL* &kernelParam)
{
    int32_t hr = CM_SUCCESS;
    // critical section protection
    CLock locker(m_criticalSectionGPUCopyKrn);

    CM_CHK_NULL_GOTOFINISH(kernelParam, CM_INVALID_GPUCOPY_KERNEL);

    // the newly created kernel must be locked
    if(!kernelParam->locked)
    {
        CM_ASSERTMESSAGE("Error: The newly created kernel must be locked.")
        hr = CM_INVALID_GPUCOPY_KERNEL;
        goto finish;
    }

    // Append at the current tail of the cached-kernel array (index == count).
    m_copyKernelParamArray.SetElement(m_copyKernelParamArrayCount, kernelParam);
    m_copyKernelParamArrayCount ++;

finish:
    return hr;
}
| |
| //*--------------------------------------------------------------------------------------------------------- |
| //| Name: GetGPUCopyKrnID() |
//| Purpose: Calculate the kernel ID according to surface's width, height and copy direction
| //| Arguments: |
| //| widthInByte [in] surface's width in bytes |
| //| height [in] surface's height |
//| format [in] surface's format
| //| copyDirection [in] copy direction, cpu -> gpu or gpu -> cpu |
| //| kernelID [out] kernel id |
| //| |
| //| Returns: Result of the operation. |
| //| |
| //*--------------------------------------------------------------------------------------------------------- |
| int32_t CmQueueRT::GetGPUCopyKrnID( uint32_t widthInByte, uint32_t height, CM_SURFACE_FORMAT format, |
| CM_GPUCOPY_DIRECTION copyDirection, CM_GPUCOPY_KERNEL_ID &kernelID ) |
| { |
| int32_t hr = CM_SUCCESS; |
| |
| kernelID = GPU_COPY_KERNEL_UNKNOWN; |
| |
| if (format == CM_SURFACE_FORMAT_NV12 || format == CM_SURFACE_FORMAT_P010 || format == CM_SURFACE_FORMAT_P016) |
| { |
| switch(copyDirection) |
| { |
| case CM_FASTCOPY_GPU2CPU: |
| if ( (height&0x7) ||(widthInByte&0x7f)) |
| { |
| kernelID = GPU_COPY_KERNEL_GPU2CPU_UNALIGNED_NV12_ID ; |
| } |
| else |
| { // height 8-row aligned, widthByte 128 multiple |
| kernelID = GPU_COPY_KERNEL_GPU2CPU_ALIGNED_NV12_ID ; |
| } |
| break; |
| |
| case CM_FASTCOPY_CPU2GPU: |
| kernelID = GPU_COPY_KERNEL_CPU2GPU_NV12_ID; |
| break; |
| |
| case CM_FASTCOPY_GPU2GPU: |
| kernelID = GPU_COPY_KERNEL_GPU2GPU_NV12_ID; |
| break; |
| |
| case CM_FASTCOPY_CPU2CPU: |
| kernelID = GPU_COPY_KERNEL_CPU2CPU_ID; |
| break; |
| |
| default : |
| CM_ASSERTMESSAGE("Error: Invalid fast copy direction.") |
| hr = CM_FAILURE; |
| break; |
| } |
| } |
| else |
| { |
| switch(copyDirection) |
| { |
| case CM_FASTCOPY_GPU2CPU: |
| if ( (height&0x7) ||(widthInByte&0x7f)) |
| { |
| kernelID = GPU_COPY_KERNEL_GPU2CPU_UNALIGNED_ID; |
| } |
| else |
| { // height 8-row aligned, widthByte 128 multiple |
| kernelID = GPU_COPY_KERNEL_GPU2CPU_ALIGNED_ID; |
| } |
| break; |
| |
| case CM_FASTCOPY_CPU2GPU: |
| kernelID = GPU_COPY_KERNEL_CPU2GPU_ID; |
| break; |
| |
| case CM_FASTCOPY_GPU2GPU: |
| kernelID = GPU_COPY_KERNEL_GPU2GPU_ID; |
| break; |
| |
| case CM_FASTCOPY_CPU2CPU: |
| kernelID = GPU_COPY_KERNEL_CPU2CPU_ID; |
| break; |
| |
| default : |
| CM_ASSERTMESSAGE("Error: Invalid fast copy direction.") |
| hr = CM_FAILURE; |
| break; |
| } |
| } |
| |
| return hr; |
| } |
| |
| //*--------------------------------------------------------------------------------------------------------- |
| //| Name: AllocateGPUCopyKernel() |
| //| Purpose: Allocate GPUCopy Kernel |
| //| Arguments: |
| //| widthInByte [in] surface's width in bytes |
| //| height [in] surface's height |
//| format [in] surface's format
| //| copyDirection [in] copy direction, cpu -> gpu or gpu -> cpu |
| //| kernel [out] pointer to created kernel |
| //| |
| //| Returns: Result of the operation. |
| //| |
| //*--------------------------------------------------------------------------------------------------------- |
int32_t CmQueueRT::AllocateGPUCopyKernel( uint32_t widthInByte, uint32_t height, CM_SURFACE_FORMAT format,
                                          CM_GPUCOPY_DIRECTION copyDirection, CmKernel *&kernel )
{
    int32_t hr = CM_SUCCESS;
    CmProgram *gpuCopyProgram = nullptr;

    // Load (or fetch a cached copy of) the predefined GPU-copy program first.
    CM_CHK_CMSTATUS_GOTOFINISH( m_device->LoadPredefinedCopyKernel(gpuCopyProgram));
    CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyProgram);

    // NV12/P010/P016 planar formats use dedicated NV12 kernel variants;
    // kernel selection mirrors GetGPUCopyKrnID.
    if (format == CM_SURFACE_FORMAT_NV12 || format == CM_SURFACE_FORMAT_P010 || format == CM_SURFACE_FORMAT_P016)
    {
        switch(copyDirection)
        {
        case CM_FASTCOPY_GPU2CPU:
            if ( (height&0x7) ||(widthInByte&0x7f))
            {
                CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel( gpuCopyProgram, _NAME( surfaceCopy_read_NV12_32x32 ) , kernel,"PredefinedGPUCopyKernel"));
            }
            else
            { // height 8-row aligned, widthByte 128 multiple
                CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel( gpuCopyProgram, _NAME( surfaceCopy_read_NV12_aligned_32x32 ) , kernel,"PredefinedGPUCopyKernel"));
            }
            break;

        case CM_FASTCOPY_CPU2GPU:
            CM_CHK_CMSTATUS_GOTOFINISH( m_device->CreateKernel( gpuCopyProgram, _NAME( surfaceCopy_write_NV12_32x32 ), kernel, "PredefinedGPUCopyKernel"));
            break;

        case CM_FASTCOPY_GPU2GPU:
            CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(SurfaceCopy_2DTo2D_NV12_32x32), kernel, "PredefinedGPUCopyKernel"));
            break;

        case CM_FASTCOPY_CPU2CPU:
            CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(SurfaceCopy_BufferToBuffer_4k), kernel, "PredefinedGPUCopyKernel"));
            break;

        default :
            CM_ASSERTMESSAGE("Error: Invalid fast copy direction.")
            hr = CM_FAILURE;
            break;
        }
    }
    else
    {
        switch(copyDirection)
        {
        case CM_FASTCOPY_GPU2CPU:
            if ( (height&0x7) ||(widthInByte&0x7f))
            {
                CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel( gpuCopyProgram, _NAME( surfaceCopy_read_32x32 ) , kernel, "PredefinedGPUCopyKernel"));
            }
            else
            { // height 8-row aligned, widthByte 128 multiple
                CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel( gpuCopyProgram, _NAME( surfaceCopy_read_aligned_32x32 ) , kernel, "PredefinedGPUCopyKernel"));
            }
            break;

        case CM_FASTCOPY_CPU2GPU:
            CM_CHK_CMSTATUS_GOTOFINISH( m_device->CreateKernel( gpuCopyProgram, _NAME( surfaceCopy_write_32x32 ), kernel, "PredefinedGPUCopyKernel" ));
            break;

        case CM_FASTCOPY_GPU2GPU:
            CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(SurfaceCopy_2DTo2D_32x32), kernel, "PredefinedGPUCopyKernel"));
            break;

        case CM_FASTCOPY_CPU2CPU:
            CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(SurfaceCopy_BufferToBuffer_4k), kernel, "PredefinedGPUCopyKernel"));
            break;

        default :
            CM_ASSERTMESSAGE("Error: Invalid fast copy direction.")
            hr = CM_FAILURE;
            break;
        }
    }

finish:
    return hr;
}
| |
| CM_RT_API int32_t CmQueueRT::EnqueueFast(CmTask *task, |
| CmEvent* &event, |
| const CmThreadSpace *threadSpace) |
| { |
| CM_HAL_STATE * state = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState; |
| int32_t result = CM_SUCCESS; |
| if (state == nullptr || state->advExecutor == nullptr) |
| { |
| result = CM_NULL_POINTER; |
| } |
| else |
| { |
| const CmThreadSpaceRT *threadSpaceRTConst |
| = static_cast<const CmThreadSpaceRT*>(threadSpace); |
| if (state->cmHalInterface->CheckMediaModeAvailability() == false) |
| { |
| uint32_t old_stream_idx = state->osInterface->streamIndex; |
| state->osInterface->streamIndex = m_streamIndex; |
| if (threadSpaceRTConst != nullptr) |
| { |
| result = state->advExecutor->SubmitComputeTask( |
| this, task, event, |
| threadSpaceRTConst->GetThreadGroupSpace(), |
| (MOS_GPU_CONTEXT)m_queueOption.GPUContext); |
| } |
| else |
| { |
| result = state->advExecutor->SubmitComputeTask( |
| this, task, event, nullptr, |
| (MOS_GPU_CONTEXT)m_queueOption.GPUContext); |
| } |
| state->osInterface->streamIndex = old_stream_idx; |
| } |
| else |
| { |
| result = state->advExecutor->SubmitTask( |
| this, task, event, threadSpace, |
| (MOS_GPU_CONTEXT)m_queueOption.GPUContext); |
| } |
| } |
| return result; |
| } |
| |
| CM_RT_API int32_t CmQueueRT::DestroyEventFast(CmEvent *&event) |
| { |
| CM_HAL_STATE * state = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState; |
| |
| if (state == nullptr || state->advExecutor == nullptr) |
| { |
| return CM_NULL_POINTER; |
| } |
| else |
| { |
| return state->advExecutor->DestoryEvent(this, event); |
| } |
| } |
| |
| CM_RT_API int32_t |
| CmQueueRT::EnqueueWithGroupFast(CmTask *task, |
| CmEvent* &event, |
| const CmThreadGroupSpace *threadGroupSpace) |
| { |
| CM_HAL_STATE * state = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState; |
| int32_t result = CM_SUCCESS; |
| if (state == nullptr || state->advExecutor == nullptr) |
| { |
| return CM_NULL_POINTER; |
| } |
| |
| uint32_t old_stream_idx = state->osInterface->streamIndex; |
| state->osInterface->streamIndex = m_streamIndex; |
| result = state->advExecutor->SubmitComputeTask( |
| this, task, event, threadGroupSpace, |
| (MOS_GPU_CONTEXT)m_queueOption.GPUContext); |
| state->osInterface->streamIndex = old_stream_idx; |
| return result; |
| } |
| |
| int32_t CmQueueRT::GetOSSyncEventHandle(void *& hOSSyncEvent) |
| { |
| hOSSyncEvent = m_osSyncEvent; |
| return CM_SUCCESS; |
| } |
| |
| |
//! \brief    Registers a UMD notification event handle with the HAL.
//! \details  Enables event-driven (rather than polling) waits on task
//!           completion. On success the OS event handle returned by the HAL
//!           is cached in m_osSyncEvent for later retrieval through
//!           GetOSSyncEventHandle().
//! \return   CM_SUCCESS on success; a CM error code translated from the HAL
//!           status otherwise (via the GOTOFINISH macro).
int32_t CmQueueRT::RegisterSyncEvent()
{
    CM_RETURN_CODE hr = CM_SUCCESS;

    CM_HAL_OSSYNC_PARAM syncParam;
    // Initialize the in/out handle to nullptr; the HAL fills it in on success.
    void *syncEventHandle = nullptr;
    syncParam.osSyncEvent = syncEventHandle;

    PCM_CONTEXT_DATA cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
    PCM_HAL_STATE cmHalState = cmData->cmHalState;
    // Call HAL layer to wait for Task finished with event-driven mechanism
    CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(cmHalState->pfnRegisterUMDNotifyEventHandle(cmHalState, &syncParam));

    // Cache the handle the HAL wrote back; only reached on success.
    m_osSyncEvent = syncParam.osSyncEvent;

finish:
    return hr;
}
| |
| MOS_STATUS CmQueueRT::CreateGpuContext(CM_HAL_STATE *halState, |
| MOS_GPU_CONTEXT gpuContextName, |
| MOS_GPU_NODE gpuNode, |
| MOS_GPUCTX_CREATOPTIONS *createOptions) |
| { |
| uint32_t old_stream_idx = 0; |
| if (MOS_GPU_CONTEXT_CM_COMPUTE == gpuContextName) |
| { |
| m_streamIndex = halState->pfnRegisterStream(halState); |
| old_stream_idx = halState->osInterface->streamIndex; |
| halState->osInterface->streamIndex = m_streamIndex; |
| } |
| else |
| { // As there is only one render context, the original stream index will be used. |
| old_stream_idx = m_streamIndex = halState->osInterface->streamIndex; |
| } |
| MOS_STATUS status = halState->pfnCreateGPUContext(halState, |
| gpuContextName, gpuNode, |
| createOptions); |
| halState->osInterface->streamIndex = old_stream_idx; |
| return status; |
| } |
| |
| MOS_STATUS CmQueueRT::ExecuteGroupTask(CM_HAL_STATE *halState, |
| CM_HAL_EXEC_TASK_GROUP_PARAM *taskParam, |
| MOS_GPU_CONTEXT gpuContextName) |
| { |
| uint32_t old_stream_idx = halState->osInterface->streamIndex; |
| halState->osInterface->streamIndex = m_streamIndex; |
| MOS_STATUS result |
| = halState->osInterface->pfnSetGpuContext(halState->osInterface, |
| gpuContextName); |
| if (MOS_STATUS_SUCCESS != result) |
| { |
| return result; |
| } |
| RegisterSyncEvent(); |
| result = halState->pfnExecuteGroupTask(halState, taskParam); |
| halState->osInterface->streamIndex = old_stream_idx; |
| return result; |
| } |
| } // namespace |