/*
* Copyright (c) 2007-2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
//!
//! \file cm_queue_rt.cpp
//! \brief Contains CmQueueRT implementations.
//!
#include "cm_queue_rt.h"
#include "cm_mem.h"
#include "cm_device_rt.h"
#include "cm_event_rt.h"
#include "cm_task_rt.h"
#include "cm_task_internal.h"
#include "cm_thread_space_rt.h"
#include "cm_kernel_rt.h"
#include "cm_kernel_data.h"
#include "cm_buffer_rt.h"
#include "cm_group_space.h"
#include "cm_vebox_data.h"
#include "cm_surface_manager.h"
#include "cm_surface_2d_rt.h"
#include "cm_vebox_rt.h"
#include "cm_execution_adv.h"
// Used by GPUCopy
#define BLOCK_PIXEL_WIDTH (32)
#define BLOCK_HEIGHT (8)
#define BLOCK_HEIGHT_NV12 (4)
#define SUB_BLOCK_PIXEL_WIDTH (8)
#define SUB_BLOCK_HEIGHT (8)
#define SUB_BLOCK_HEIGHT_NV12 (4)
#define INNER_LOOP (4)
#define BYTE_COPY_ONE_THREAD (1024*INNER_LOOP) //4K for each thread
#define THREAD_SPACE_WIDTH_INCREMENT (8)
//Used by unaligned copy
#define BLOCK_WIDTH (64)
#define PAGE_ALIGNED (0x1000)
#define GPUCOPY_KERNEL_LOCK(a) ((a)->locked = true)
#define GPUCOPY_KERNEL_UNLOCK(a) ((a)->locked = false)
namespace CMRT_UMD
{
//*-----------------------------------------------------------------------------
//| Purpose: Create Queue
//| Returns: Result of the operation.
//*-----------------------------------------------------------------------------
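// Usage note (a hedged sketch, not part of the driver itself): applications do
// not call CmQueueRT::Create() directly; they obtain a queue through the public
// CmDevice interface, which routes to this factory method, e.g.:
//
//     CmQueue *queue = nullptr;
//     int32_t ret = device->CreateQueue(queue);  // "device" is an existing CmDevice*
//     if (ret != CM_SUCCESS) { /* handle the error */ }
//
// The CmQueue returned above is the public interface of the CmQueueRT object
// created below.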
int32_t CmQueueRT::Create(CmDeviceRT *device,
CmQueueRT* &queue,
CM_QUEUE_CREATE_OPTION queueCreateOption)
{
int32_t result = CM_SUCCESS;
queue = new (std::nothrow) CmQueueRT(device, queueCreateOption);
if( queue )
{
result = queue->Initialize( );
if( result != CM_SUCCESS )
{
CmQueueRT::Destroy( queue);
}
}
else
{
CM_ASSERTMESSAGE("Error: Failed to create CmQueue due to out of system memory.");
result = CM_OUT_OF_HOST_MEMORY;
}
return result;
}
//*-----------------------------------------------------------------------------
//| Purpose: Destroy Queue
//| Returns: Result of the operation.
//*-----------------------------------------------------------------------------
int32_t CmQueueRT::Destroy(CmQueueRT* &queue )
{
if( queue == nullptr )
{
return CM_FAILURE;
}
uint32_t result = queue->CleanQueue();
CmSafeDelete( queue );
return result;
}
//*-----------------------------------------------------------------------------
//| Purpose: Constructor of Cm Queue
//| Returns: Result of the operation.
//*-----------------------------------------------------------------------------
CmQueueRT::CmQueueRT(CmDeviceRT *device,
CM_QUEUE_CREATE_OPTION queueCreateOption):
m_device(device),
m_eventArray(CM_INIT_EVENT_COUNT),
m_eventCount(0),
m_copyKernelParamArray(CM_INIT_GPUCOPY_KERNL_COUNT),
m_copyKernelParamArrayCount(0),
m_halMaxValues(nullptr),
m_queueOption(queueCreateOption),
m_usingVirtualEngine(false),
m_osSyncEvent(nullptr),
m_trackerIndex(0),
m_fastTrackerIndex(0),
m_streamIndex(0)
{
MOS_ZeroMemory(&m_mosVeHintParams, sizeof(m_mosVeHintParams));
}
//*-----------------------------------------------------------------------------
//| Purpose: Destructor of Cm Queue
//| Returns: Result of the operation.
//*-----------------------------------------------------------------------------
CmQueueRT::~CmQueueRT()
{
m_osSyncEvent = nullptr;
uint32_t eventArrayUsedSize = m_eventArray.GetMaxSize();
for( uint32_t i = 0; i < eventArrayUsedSize; i ++ )
{
CmEventRT* event = (CmEventRT*)m_eventArray.GetElement( i );
uint32_t eventReleaseTimes = 0;
while( event )
{ // destroy the event even if the user has not released it
if(eventReleaseTimes > 2)
{
// The maximum reference count of an event is 2.
// If the event is still alive after being destroyed twice, something is wrong.
CM_ASSERTMESSAGE("Error: The maximum reference count of an event is 2.");
break;
}
CmEventRT::Destroy( event );
eventReleaseTimes ++;
}
}
m_eventArray.Delete();
// Do not destroy the kernels in m_copyKernelParamArray.
// They have already been destroyed in ~CmDevice() before the queue is destroyed.
for( uint32_t i = 0; i < m_copyKernelParamArrayCount; i ++ )
{
CM_GPUCOPY_KERNEL *gpuCopyParam = (CM_GPUCOPY_KERNEL*)m_copyKernelParamArray.GetElement( i );
CmSafeDelete(gpuCopyParam);
}
m_copyKernelParamArray.Delete();
}
//*-----------------------------------------------------------------------------
//| Purpose: Initialize Cm Queue
//| Returns: Result of the operation.
//*-----------------------------------------------------------------------------
int32_t CmQueueRT::Initialize()
{
PCM_HAL_STATE cmHalState = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
CM_HAL_MAX_VALUES_EX* halMaxValuesEx = nullptr;
CM_RETURN_CODE hr = CM_SUCCESS;
m_device->GetHalMaxValues(m_halMaxValues, halMaxValuesEx);
// Assign a new tracker and record the tracker index
int ret = cmHalState->renderHal->trackerProducer.AssignNewTracker();
CM_CHK_COND_RETURN((ret < 0), CM_FAILURE, "Error: failed to assign a new tracker");
m_trackerIndex = ret;
ret = cmHalState->advExecutor->AssignNewTracker();
CM_CHK_COND_RETURN((ret < 0), CM_FAILURE, "Error: failed to assign a new tracker");
m_fastTrackerIndex = ret;
// Create or get the GPU context for this queue
if (m_queueOption.UserGPUContext == true)
{
// Validate the user-provided GPU context. If it is valid, create the queue on that existing context.
if (cmHalState->osInterface->pfnIsGpuContextValid(cmHalState->osInterface, (MOS_GPU_CONTEXT)m_queueOption.GPUContext) != MOS_STATUS_SUCCESS)
{
// Returns failure
CM_ASSERTMESSAGE("Error: The user passed in a GPU context which is not valid");
return CM_INVALID_USER_GPU_CONTEXT_FOR_QUEUE_EX;
}
}
else
{
MOS_GPUCTX_CREATOPTIONS ctxCreateOption;
ctxCreateOption.CmdBufferNumScale
= HalCm_GetNumCmdBuffers(cmHalState->osInterface, cmHalState->cmDeviceParam.maxTasks);
// Create MDF preset GPU context, update GPUContext in m_queueOption
if (m_queueOption.QueueType == CM_QUEUE_TYPE_RENDER)
{
MOS_GPU_CONTEXT tmpGpuCtx = cmHalState->requestCustomGpuContext ? MOS_GPU_CONTEXT_RENDER4 : MOS_GPU_CONTEXT_RENDER3;
// check if context handle was specified by user.
if (m_queueOption.GPUContext != 0)
{
tmpGpuCtx = (MOS_GPU_CONTEXT)m_queueOption.GPUContext;
}
// sanity check of context handle for CM
if (HalCm_IsValidGpuContext(tmpGpuCtx) == false)
{
return CM_INVALID_USER_GPU_CONTEXT_FOR_QUEUE_EX;
}
// SSEU overriding
if (cmHalState->cmHalInterface->IsOverridePowerOptionPerGpuContext())
{
// check whether sub-slices need to be shut down for VME usage
if (m_queueOption.SseuUsageHint == CM_QUEUE_SSEU_USAGE_HINT_VME
&& cmHalState->cmHalInterface->IsRequestShutdownSubslicesForVmeUsage())
{
MEDIA_SYSTEM_INFO *gtSystemInfo = cmHalState->osInterface->pfnGetGtSystemInfo(cmHalState->osInterface);
ctxCreateOption.packed.SliceCount = (uint8_t)gtSystemInfo->SliceCount;
ctxCreateOption.packed.SubSliceCount = (gtSystemInfo->SubSliceCount / gtSystemInfo->SliceCount) >> 1; // set to half
ctxCreateOption.packed.MaxEUcountPerSubSlice = gtSystemInfo->EUCount/gtSystemInfo->SubSliceCount;
ctxCreateOption.packed.MinEUcountPerSubSlice = gtSystemInfo->EUCount/gtSystemInfo->SubSliceCount;
}
#if (_DEBUG || _RELEASE_INTERNAL)
MOS_USER_FEATURE_VALUE_DATA UserFeatureData = {0};
MOS_UserFeature_ReadValue_ID(
nullptr,
__MEDIA_USER_FEATURE_VALUE_SSEU_SETTING_OVERRIDE_ID,
&UserFeatureData);
// +---------------+----------------+----------------+----------------+
// | EUCountMax | EUCountMin | SSCount | SliceCount |
// +-------------24+--------------16+---------------8+---------------0+
if (UserFeatureData.u32Data != 0xDEADC0DE)
{
ctxCreateOption.packed.SliceCount = UserFeatureData.u32Data & 0xFF; // Bits 0-7
ctxCreateOption.packed.SubSliceCount = (UserFeatureData.u32Data >> 8) & 0xFF; // Bits 8-15
ctxCreateOption.packed.MaxEUcountPerSubSlice = (UserFeatureData.u32Data >> 16) & 0xFF; // Bits 16-23
ctxCreateOption.packed.MinEUcountPerSubSlice = (UserFeatureData.u32Data >> 24) & 0xFF; // Bits 24-31
}
#endif
}
ctxCreateOption.RAMode = m_queueOption.RAMode;
// Create render GPU context.
CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(
CreateGpuContext(cmHalState, tmpGpuCtx, MOS_GPU_NODE_3D,
&ctxCreateOption));
#if (_RELEASE_INTERNAL || _DEBUG)
#if defined(CM_DIRECT_GUC_SUPPORT)
//init GuC
CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(cmHalState->osInterface->pfnInitGuC(cmHalState->osInterface, MOS_GPU_NODE_3D));
#endif
#endif
m_queueOption.GPUContext = tmpGpuCtx;
}
else if (m_queueOption.QueueType == CM_QUEUE_TYPE_COMPUTE)
{
ctxCreateOption.RAMode = m_queueOption.RAMode;
bool bVeUsedInCm = false; // change to true once the feature is complete.
#if (_DEBUG || _RELEASE_INTERNAL)
MOS_USER_FEATURE_VALUE_DATA UserFeatureData = {0};
MOS_UserFeature_ReadValue_ID(nullptr,
__MEDIA_USER_FEATURE_VALUE_MDF_CCS_USE_VE_INTERFACE, &UserFeatureData);
bVeUsedInCm = (UserFeatureData.u32Data == 0x1);
#endif
Mos_SetVirtualEngineSupported(cmHalState->osInterface, bVeUsedInCm);
if (cmHalState->osInterface->veDefaultEnable && cmHalState->osInterface->bSupportVirtualEngine) // check if VE enabled on OS
{
// prepare virtual engine hint parameters on this cm queue.
CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(
HalCm_PrepareVEHintParam(cmHalState, false, &m_mosVeHintParams));
m_usingVirtualEngine = true;
}
CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(
CreateGpuContext(cmHalState, MOS_GPU_CONTEXT_CM_COMPUTE,
MOS_GPU_NODE_COMPUTE, &ctxCreateOption));
m_queueOption.GPUContext = MOS_GPU_CONTEXT_CM_COMPUTE;
}
else
{
// Returns failure
CM_ASSERTMESSAGE("Error: The QueueType is not supported by MDF.");
return CM_NOT_IMPLEMENTED;
}
}
finish:
return hr;
}
//*-----------------------------------------------------------------------------
//| Purpose: Checks whether any kernels in the task have a thread argument
//| Returns: Result of the operation.
//*-----------------------------------------------------------------------------
int32_t CmQueueRT::GetTaskHasThreadArg(CmKernelRT* kernelArray[], uint32_t numKernels, bool& threadArgExists)
{
threadArgExists = false;
for(uint32_t krn = 0; krn < numKernels; krn++)
{
if( !kernelArray[krn] )
{
CM_ASSERTMESSAGE("Error: The kernel pointer in the task is null.");
return CM_FAILURE;
}
if( kernelArray[krn]->IsThreadArgExisted( ) )
{
threadArgExists = true;
break;
}
}
return CM_SUCCESS;
}
//*-----------------------------------------------------------------------------
//| Purpose: Enqueue Task
//| Arguments :
//| kernelArray [in] Pointer to kernel array
//| event [out] Reference to the pointer to Event
//| threadSpace [in] Pointer to thread space
//|
//| Returns: Result of the operation.
//*-----------------------------------------------------------------------------
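// A minimal usage sketch for Enqueue() (illustrative only; it assumes a CmDevice
// ("device"), a loaded CmProgram ("program"), a queue and the thread-space
// dimensions already exist -- those names are placeholders, not part of this file):
//
//     CmKernel *kernel = nullptr;
//     device->CreateKernel(program, "myKernel", kernel);
//     kernel->SetThreadCount(threadsWide * threadsHigh);
//
//     CmTask *task = nullptr;
//     device->CreateTask(task);
//     task->AddKernel(kernel);
//
//     CmThreadSpace *threadSpace = nullptr;
//     device->CreateThreadSpace(threadsWide, threadsHigh, threadSpace);
//
//     CmEvent *event = nullptr;
//     queue->Enqueue(task, event, threadSpace);  // non-blocking submission
//     event->WaitForTaskFinished();              // block until the GPU finishes
//     queue->DestroyEvent(event);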
CM_RT_API int32_t CmQueueRT::Enqueue(
CmTask* kernelArray,
CmEvent* & event,
const CmThreadSpace* threadSpace)
{
INSERT_API_CALL_LOG();
if (kernelArray == nullptr)
{
CM_ASSERTMESSAGE("Error: Kernel array is null.");
return CM_INVALID_ARG_VALUE;
}
CmTaskRT *kernelArrayRT = static_cast<CmTaskRT *>(kernelArray);
uint32_t kernelCount = 0;
kernelCount = kernelArrayRT->GetKernelCount();
if (kernelCount == 0)
{
CM_ASSERTMESSAGE("Error: Invalid kernel count.");
return CM_FAILURE;
}
if (kernelCount > m_halMaxValues->maxKernelsPerTask)
{
CM_ASSERTMESSAGE("Error: Kernel count exceeds max kernel per enqueue.");
return CM_EXCEED_MAX_KERNEL_PER_ENQUEUE;
}
int32_t result;
const CmThreadSpaceRT *threadSpaceRTConst = static_cast<const CmThreadSpaceRT *>(threadSpace);
PCM_HAL_STATE cmHalState = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
if (cmHalState->cmHalInterface->CheckMediaModeAvailability() == false)
{
if (threadSpaceRTConst != nullptr)
{
result = EnqueueWithGroup(kernelArray, event, threadSpaceRTConst->GetThreadGroupSpace());
}
else
{
// If there is no shared thread space or associated thread space,
// create a temporary (maxThreadCount x 1) thread group space whose
// size equals the maximum thread count among the kernels that have
// no thread space associated.
uint32_t maxThreadCount = 1;
bool usedCommonTGS = false;
for (uint32_t i = 0; i < kernelCount; i++)
{
CmKernelRT *tmpKernel = kernelArrayRT->GetKernelPointer(i);
CmThreadGroupSpace *tmpTGS = nullptr;
tmpKernel->GetThreadGroupSpace(tmpTGS);
if (tmpTGS == nullptr)
{
usedCommonTGS = true;
uint32_t singleThreadCount = 0;
tmpKernel->GetThreadCount(singleThreadCount);
if (maxThreadCount < singleThreadCount)
{
maxThreadCount = singleThreadCount;
}
}
}
CmThreadGroupSpace *threadGroupSpaceTemp = nullptr;
if (usedCommonTGS == true)
{
result = m_device->CreateThreadGroupSpace(1, 1, maxThreadCount, 1, threadGroupSpaceTemp);
if (result != CM_SUCCESS)
{
CM_ASSERTMESSAGE("Error: Creating temporary thread group space failure.");
return result;
}
}
result = EnqueueWithGroup(kernelArray, event, threadGroupSpaceTemp);
if (threadGroupSpaceTemp != nullptr)
{
m_device->DestroyThreadGroupSpace(threadGroupSpaceTemp);
}
}
return result;
}
if (threadSpaceRTConst && threadSpaceRTConst->IsThreadAssociated())
{
if (threadSpaceRTConst->GetNeedSetKernelPointer() && threadSpaceRTConst->KernelPointerIsNULL())
{
CmKernelRT* tmp = nullptr;
tmp = kernelArrayRT->GetKernelPointer(0);
threadSpaceRTConst->SetKernelPointer(tmp);
}
}
#if _DEBUG
if (threadSpaceRTConst)
{
CmThreadSpaceRT *threadSpaceRT = const_cast<CmThreadSpaceRT*>(threadSpaceRTConst);
if (!threadSpaceRT->IntegrityCheck(kernelArrayRT))
{
CM_ASSERTMESSAGE("Error: Invalid thread space.");
return CM_INVALID_THREAD_SPACE;
}
}
#endif
if(m_device->IsPrintEnable())
{
m_device->ClearPrintBuffer();
}
typedef CmKernelRT* pCmKernel;
CmKernelRT** tmp = MOS_NewArray(pCmKernel, (kernelCount + 1));
if(tmp == nullptr)
{
CM_ASSERTMESSAGE("Error: Out of system memory.");
return CM_OUT_OF_HOST_MEMORY;
}
uint32_t totalThreadNumber = 0;
for(uint32_t i = 0; i < kernelCount; i++)
{
tmp[ i ] = kernelArrayRT->GetKernelPointer(i);
uint32_t singleThreadNumber = 0;
tmp[i]->GetThreadCount(singleThreadNumber);
if (singleThreadNumber == 0)
{
CmThreadSpaceRT *threadSpaceRT = const_cast<CmThreadSpaceRT*>(threadSpaceRTConst);
if (threadSpaceRT)
{
uint32_t width, height;
threadSpaceRT->GetThreadSpaceSize(width, height);
singleThreadNumber = width*height;
}
}
totalThreadNumber += singleThreadNumber;
}
tmp[kernelCount ] = nullptr;
CmEventRT *eventRT = static_cast<CmEventRT *>(event);
result = Enqueue_RT(tmp, kernelCount, totalThreadNumber, eventRT, threadSpaceRTConst, kernelArrayRT->GetSyncBitmap(), kernelArrayRT->GetPowerOption(),
kernelArrayRT->GetConditionalEndBitmap(), kernelArrayRT->GetConditionalEndInfo(), kernelArrayRT->GetTaskConfig());
if (eventRT)
{
eventRT->SetKernelNames(kernelArrayRT, const_cast<CmThreadSpaceRT*>(threadSpaceRTConst), nullptr);
}
event = eventRT;
MosSafeDeleteArray( tmp );
return result;
}
//*-----------------------------------------------------------------------------
//| Purpose: Enqueue Task
//| Arguments :
//| kernelArray [in] Pointer to kernel array
//| event [out] Reference to the pointer to Event
//| threadSpace [in] Pointer to thread space
//|
//| Returns: Result of the operation.
//*-----------------------------------------------------------------------------
int32_t CmQueueRT::Enqueue_RT(
CmKernelRT* kernelArray[],
const uint32_t kernelCount,
const uint32_t totalThreadCount,
CmEventRT* & event,
const CmThreadSpaceRT* threadSpace,
uint64_t syncBitmap,
PCM_POWER_OPTION powerOption,
uint64_t conditionalEndBitmap,
CM_HAL_CONDITIONAL_BB_END_INFO* conditionalEndInfo,
PCM_TASK_CONFIG taskConfig)
{
if(kernelArray == nullptr)
{
CM_ASSERTMESSAGE("Error: Kernel array is NULL.");
return CM_INVALID_ARG_VALUE;
}
if( kernelCount == 0 )
{
CM_ASSERTMESSAGE("Error: There are no valid kernels.");
return CM_INVALID_ARG_VALUE;
}
bool isEventVisible = (event != CM_NO_EVENT);
CLock Locker(m_criticalSectionTaskInternal);
// set the current tracker index in renderhal
PCM_CONTEXT_DATA cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
CM_CHK_NULL_RETURN_CMERROR(cmData);
CM_CHK_NULL_RETURN_CMERROR(cmData->cmHalState);
CM_CHK_NULL_RETURN_CMERROR(cmData->cmHalState->renderHal);
cmData->cmHalState->renderHal->currentTrackerIndex = m_trackerIndex;
CmTaskInternal* task = nullptr;
int32_t result = CmTaskInternal::Create(kernelCount, totalThreadCount, kernelArray, threadSpace, m_device, syncBitmap, task, conditionalEndBitmap, conditionalEndInfo);
if( result != CM_SUCCESS )
{
CM_ASSERTMESSAGE("Error: Create CM task internal failure.");
return result;
}
LARGE_INTEGER nEnqueueTime;
if ( !(MOS_QueryPerformanceCounter( (uint64_t*)&nEnqueueTime.QuadPart )))
{
CM_ASSERTMESSAGE("Error: Query performance counter failure.");
CmTaskInternal::Destroy(task);
return CM_FAILURE;
}
int32_t taskDriverId = -1;
result = CreateEvent(task, isEventVisible, taskDriverId, event);
if (result != CM_SUCCESS)
{
CM_ASSERTMESSAGE("Error: Create event failure.");
return result;
}
if ( event != nullptr )
{
event->SetEnqueueTime( nEnqueueTime );
}
task->SetPowerOption( powerOption );
task->SetProperty(taskConfig);
if( !m_enqueuedTasks.Push( task ) )
{
CM_ASSERTMESSAGE("Error: Push enqueued tasks failure.");
return CM_FAILURE;
}
result = FlushTaskWithoutSync();
return result;
}
int32_t CmQueueRT::Enqueue_RT(CmKernelRT* kernelArray[],
const uint32_t kernelCount,
const uint32_t totalThreadCount,
CmEventRT* & event,
const CmThreadGroupSpace* threadGroupSpace,
uint64_t syncBitmap,
PCM_POWER_OPTION powerOption,
uint64_t conditionalEndBitmap,
CM_HAL_CONDITIONAL_BB_END_INFO* conditionalEndInfo,
PCM_TASK_CONFIG taskConfig,
const CM_EXECUTION_CONFIG* krnExecCfg)
{
if(kernelArray == nullptr)
{
CM_ASSERTMESSAGE("Error: Kernel array is NULL.");
return CM_INVALID_ARG_VALUE;
}
if( kernelCount == 0 )
{
CM_ASSERTMESSAGE("Error: There are no valid kernels.");
return CM_INVALID_ARG_VALUE;
}
CLock Locker(m_criticalSectionTaskInternal);
// set the current tracker index in renderhal
PCM_CONTEXT_DATA cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
CM_CHK_NULL_RETURN_CMERROR(cmData);
CM_CHK_NULL_RETURN_CMERROR(cmData->cmHalState);
CM_CHK_NULL_RETURN_CMERROR(cmData->cmHalState->renderHal);
cmData->cmHalState->renderHal->currentTrackerIndex = m_trackerIndex;
CmTaskInternal* task = nullptr;
int32_t result = CmTaskInternal::Create( kernelCount, totalThreadCount, kernelArray,
threadGroupSpace, m_device, syncBitmap, task,
conditionalEndBitmap, conditionalEndInfo, krnExecCfg);
if( result != CM_SUCCESS )
{
CM_ASSERTMESSAGE("Error: Create CmTaskInternal failure.");
return result;
}
LARGE_INTEGER nEnqueueTime;
if ( !(MOS_QueryPerformanceCounter( (uint64_t*)&nEnqueueTime.QuadPart )))
{
CM_ASSERTMESSAGE("Error: Query performance counter failure.");
CmTaskInternal::Destroy(task);
return CM_FAILURE;
}
int32_t taskDriverId = -1;
result = CreateEvent(task, (event != CM_NO_EVENT), taskDriverId, event);
if (result != CM_SUCCESS)
{
CM_ASSERTMESSAGE("Error: Create event failure.");
return result;
}
if ( event != nullptr )
{
event->SetEnqueueTime( nEnqueueTime );
}
task->SetPowerOption( powerOption );
task->SetProperty(taskConfig);
if( !m_enqueuedTasks.Push( task ) )
{
CM_ASSERTMESSAGE("Error: Push enqueued tasks failure.");
return CM_FAILURE;
}
result = FlushTaskWithoutSync();
return result;
}
int32_t CmQueueRT::Enqueue_RT( CmKernelRT* kernelArray[],
CmEventRT* & event,
uint32_t numTasksGenerated,
bool isLastTask,
uint32_t hints,
PCM_POWER_OPTION powerOption)
{
int32_t result = CM_FAILURE;
uint32_t kernelCount = 0;
CmTaskInternal* task = nullptr;
int32_t taskDriverId = -1;
bool isEventVisible = (event != CM_NO_EVENT);
bool threadArgExists = false;
if( kernelArray == nullptr)
{
CM_ASSERTMESSAGE("Error: Kernel array is NULL.");
return CM_INVALID_ARG_VALUE;
}
while( kernelArray[ kernelCount ] )
{
kernelCount++;
}
if( kernelCount < CM_MINIMUM_NUM_KERNELS_ENQWHINTS )
{
CM_ASSERTMESSAGE("Error: EnqueueWithHints requires at least 2 kernels.");
return CM_FAILURE;
}
uint32_t totalThreadCount = 0;
for( uint32_t i = 0; i < kernelCount; i ++ )
{
uint32_t threadCount = 0;
kernelArray[i]->GetThreadCount( threadCount );
totalThreadCount += threadCount;
}
if( GetTaskHasThreadArg(kernelArray, kernelCount, threadArgExists) != CM_SUCCESS )
{
CM_ASSERTMESSAGE("Error: Thread argument checking fails.");
return CM_FAILURE;
}
if( !threadArgExists )
{
if (totalThreadCount > m_halMaxValues->maxUserThreadsPerTaskNoThreadArg )
{
CM_ASSERTMESSAGE("Error: Maximum number of threads per task exceeded.");
return CM_EXCEED_MAX_THREAD_AMOUNT_PER_ENQUEUE;
}
}
else
{
if( totalThreadCount > m_halMaxValues->maxUserThreadsPerTask )
{
CM_ASSERTMESSAGE("Error: Maximum number of threads per task exceeded.");
return CM_EXCEED_MAX_THREAD_AMOUNT_PER_ENQUEUE;
}
}
CLock Locker(m_criticalSectionTaskInternal);
// set the current tracker index in renderhal
PCM_CONTEXT_DATA cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
CM_CHK_NULL_RETURN_CMERROR(cmData);
CM_CHK_NULL_RETURN_CMERROR(cmData->cmHalState);
CM_CHK_NULL_RETURN_CMERROR(cmData->cmHalState->renderHal);
cmData->cmHalState->renderHal->currentTrackerIndex = m_trackerIndex;
result = CmTaskInternal::Create( kernelCount, totalThreadCount, kernelArray, task, numTasksGenerated, isLastTask, hints, m_device );
if( result != CM_SUCCESS )
{
CM_ASSERTMESSAGE("Error: Create CM task internal failure.");
return result;
}
LARGE_INTEGER nEnqueueTime;
if ( !(MOS_QueryPerformanceCounter( (uint64_t*)&nEnqueueTime.QuadPart )) )
{
CM_ASSERTMESSAGE("Error: Query performance counter failure.");
CmTaskInternal::Destroy(task);
return CM_FAILURE;
}
result = CreateEvent(task, isEventVisible, taskDriverId, event);
if (result != CM_SUCCESS)
{
CM_ASSERTMESSAGE("Error: Create event failure.");
return result;
}
if ( event != nullptr )
{
event->SetEnqueueTime( nEnqueueTime );
}
for( uint32_t i = 0; i < kernelCount; ++i )
{
CmKernelRT* kernel = nullptr;
task->GetKernel(i, kernel);
if( kernel != nullptr )
{
kernel->SetAdjustedYCoord(0);
}
}
task->SetPowerOption( powerOption );
if (!m_enqueuedTasks.Push(task))
{
CM_ASSERTMESSAGE("Error: Push enqueued tasks failure.");
return CM_FAILURE;
}
result = FlushTaskWithoutSync();
return result;
}
//*-----------------------------------------------------------------------------
//! Function to enqueue task with thread group space pointer
//! Arguments:
//! 1. Pointer to CmTask, which can only contain one kernel.
//! 2. Reference to the pointer to CmEvent that is to be returned
//! 3. Pointer to a CmThreadGroupSpace.
//! Return Value:
//! CM_SUCCESS if the task is successfully enqueued and the CmEvent is generated
//! CM_OUT_OF_HOST_MEMORY if out of host memory
//! CM_FAILURE otherwise
//! Notes:
//! If the kernel has a per-thread arg, the GPGPU object path is used.
//! If the kernel has no per-thread arg, the GPGPU walker is used.
//*-----------------------------------------------------------------------------
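// A hedged usage sketch for EnqueueWithGroup() (illustrative only; "device",
// "queue", "task" and the group dimensions are assumed to exist elsewhere).
// The walking pattern comes from a CmThreadGroupSpace instead of a CmThreadSpace:
//
//     CmThreadGroupSpace *threadGroupSpace = nullptr;
//     // 8x8 threads per group, groupsX x groupsY groups
//     device->CreateThreadGroupSpace(8, 8, groupsX, groupsY, threadGroupSpace);
//
//     CmEvent *event = nullptr;
//     queue->EnqueueWithGroup(task, event, threadGroupSpace);
//     event->WaitForTaskFinished();
//     queue->DestroyEvent(event);
//     device->DestroyThreadGroupSpace(threadGroupSpace);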
CM_RT_API int32_t CmQueueRT::EnqueueWithGroup( CmTask* task, CmEvent* & event, const CmThreadGroupSpace* threadGroupSpace)
{
INSERT_API_CALL_LOG();
int32_t result;
if(task == nullptr)
{
CM_ASSERTMESSAGE("Error: Kernel array is NULL.");
return CM_INVALID_ARG_VALUE;
}
CmTaskRT *taskRT = static_cast<CmTaskRT *>(task);
uint32_t count = 0;
count = taskRT->GetKernelCount();
if( count == 0 )
{
CM_ASSERTMESSAGE("Error: There are no valid kernels.");
return CM_FAILURE;
}
if(m_device->IsPrintEnable())
{
m_device->ClearPrintBuffer();
}
typedef CmKernelRT* pCmKernel;
CmKernelRT** tmp = MOS_NewArray(pCmKernel, (count+1));
if(tmp == nullptr)
{
CM_ASSERTMESSAGE("Error: Out of system memory.");
return CM_OUT_OF_HOST_MEMORY;
}
uint32_t totalThreadNumber = 0;
for(uint32_t i = 0; i < count; i++)
{
uint32_t singleThreadNumber = 0;
tmp[ i ] = taskRT->GetKernelPointer(i);
//Thread arguments are not allowed in the GPGPU_WALKER path
if(tmp[i]->IsThreadArgExisted())
{
CM_ASSERTMESSAGE("Error: No thread Args allowed when using group space");
MosSafeDeleteArray(tmp);
return CM_THREAD_ARG_NOT_ALLOWED;
}
tmp[i]->GetThreadCount(singleThreadNumber);
totalThreadNumber += singleThreadNumber;
}
tmp[count ] = nullptr;
CmEventRT *eventRT = static_cast<CmEventRT *>(event);
result = Enqueue_RT( tmp, count, totalThreadNumber, eventRT,
threadGroupSpace, taskRT->GetSyncBitmap(),
taskRT->GetPowerOption(),
taskRT->GetConditionalEndBitmap(), taskRT->GetConditionalEndInfo(),
taskRT->GetTaskConfig(), taskRT->GetKernelExecuteConfig());
if (eventRT)
{
eventRT->SetKernelNames(taskRT, nullptr, const_cast<CmThreadGroupSpace*>(threadGroupSpace));
}
event = eventRT;
MosSafeDeleteArray( tmp );
return result;
}
CM_RT_API int32_t CmQueueRT::EnqueueWithHints(
CmTask* kernelArray,
CmEvent* & event,
uint32_t hints)
{
INSERT_API_CALL_LOG();
int32_t hr = CM_FAILURE;
uint32_t count = 0;
uint32_t index = 0;
CmKernelRT** kernels = nullptr;
uint32_t numTasks = 0;
bool splitTask = false;
bool lastTask = false;
uint32_t numTasksGenerated = 0;
CmEventRT *eventRT = static_cast<CmEventRT *>(event);
if (kernelArray == nullptr)
{
return CM_INVALID_ARG_VALUE;
}
CmTaskRT *kernelArrayRT = static_cast<CmTaskRT *>(kernelArray);
count = kernelArrayRT->GetKernelCount();
if( count == 0 )
{
CM_ASSERTMESSAGE("Error: Invalid kernel count.");
hr = CM_FAILURE;
goto finish;
}
if( count > m_halMaxValues->maxKernelsPerTask )
{
CM_ASSERTMESSAGE("Error: Kernel count exceeds maximum kernel per enqueue.");
hr = CM_EXCEED_MAX_KERNEL_PER_ENQUEUE;
goto finish;
}
for (uint32_t i = 0; i < count; ++i)
{
CmKernelRT* kernelTmp = nullptr;
CmThreadSpaceRT* threadSpaceTmp = nullptr;
kernelTmp = kernelArrayRT->GetKernelPointer(i);
CM_CHK_NULL_GOTOFINISH_CMERROR(kernelTmp);
kernelTmp->GetThreadSpace(threadSpaceTmp);
CM_CHK_NULL_GOTOFINISH_CMERROR(threadSpaceTmp);
if (threadSpaceTmp->GetNeedSetKernelPointer() && threadSpaceTmp->KernelPointerIsNULL())
{
threadSpaceTmp->SetKernelPointer(kernelTmp);
}
}
#if _DEBUG
if( !kernelArrayRT->IntegrityCheckKernelThreadspace() )
{
CM_ASSERTMESSAGE("Error: Integrity check for kernel thread space failed.");
hr = CM_KERNEL_THREADSPACE_INTEGRITY_FAILED;
goto finish;
}
#endif
numTasks = ( hints & CM_HINTS_MASK_NUM_TASKS ) >> CM_HINTS_NUM_BITS_TASK_POS;
if( numTasks > 1 )
{
splitTask = true;
}
if( m_device->IsPrintEnable() )
{
m_device->ClearPrintBuffer();
}
kernels = MOS_NewArray(CmKernelRT*, (count + 1));
CM_CHK_NULL_GOTOFINISH_CMERROR(kernels);
do
{
for (index = 0; index < count; ++index)
{
kernels[ index ] = kernelArrayRT->GetKernelPointer( index );
}
kernels[ count ] = nullptr;
if(splitTask)
{
if( numTasksGenerated == (numTasks - 1 ) )
{
lastTask = true;
}
}
else
{
lastTask = true;
}
CM_CHK_CMSTATUS_GOTOFINISH(Enqueue_RT( kernels, eventRT, numTasksGenerated, lastTask, hints, kernelArrayRT->GetPowerOption() ));
event = eventRT;
numTasksGenerated++;
}while(numTasksGenerated < numTasks);
finish:
MosSafeDeleteArray( kernels );
return hr;
}
//*-----------------------------------------------------------------------------
//! Enqueue a task, which contains one pre-defined kernel to
//! copy from host memory to surface
//! This is a non-blocking call. i.e. it returns immediately without waiting for
//! GPU to finish the execution of the task.
//! A CmEvent is generated each time a task is enqueued. The CmEvent can
//! be used to check if the task finishes.
//! INPUT:
//! 1) Pointer to the CmSurface2D_RT as copy destination
//! 2) Pointer to the host memory as copy source
//! 3) Reference to the pointer to CMEvent
//! 4) A boolean value to indicate whether to flush the queue after enqueuing the task,
//! by default the boolean value is TRUE.
//! OUTPUT:
//! CM_SUCCESS if the task is successfully enqueued and the CmEvent is generated;
//! CM_OUT_OF_HOST_MEMORY if out of host memory;
//! CM_FAILURE otherwise.
//! More error codes may be added in the future.
//*-----------------------------------------------------------------------------
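// A hedged usage sketch for the host-to-surface copy (illustrative only; the
// surface dimensions, format and the "sysMem" buffer are assumptions). The
// system memory must stay valid until the returned event reports completion:
//
//     CmSurface2D *surface = nullptr;
//     device->CreateSurface2D(width, height, CM_SURFACE_FORMAT_A8R8G8B8, surface);
//
//     CmEvent *event = nullptr;
//     queue->EnqueueCopyCPUToGPU(surface, sysMem, event);  // non-blocking
//     event->WaitForTaskFinished();                        // wait before reusing sysMem
//     queue->DestroyEvent(event);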
CM_RT_API int32_t CmQueueRT::EnqueueCopyCPUToGPU( CmSurface2D* surface, const unsigned char* sysMem, CmEvent* & event )
{
INSERT_API_CALL_LOG();
if (!m_device->HasGpuCopyKernel())
{
return CM_NOT_IMPLEMENTED;
}
CmSurface2DRT *surfaceRT = static_cast<CmSurface2DRT *>(surface);
return EnqueueCopyInternal(surfaceRT, (unsigned char*)sysMem, 0, 0, CM_FASTCOPY_CPU2GPU, CM_FASTCOPY_OPTION_NONBLOCKING, event);
}
//*-----------------------------------------------------------------------------
//! Enqueue a task, which contains one pre-defined kernel to
//! copy from surface to host memory
//! This is a non-blocking call. i.e. it returns immediately without waiting for
//! GPU to finish the execution of the task.
//! A CmEvent is generated each time a task is enqueued. The CmEvent can
//! be used to check if the task finishes.
//! INPUT:
//! 1) Pointer to the CmSurface2D_RT as copy source
//! 2) Pointer to the host memory as copy destination
//! 3) Reference to the pointer to CMEvent
//! 4) A boolean value to indicate whether to flush the queue after enqueuing the task,
//! by default the boolean value is TRUE.
//! OUTPUT:
//! CM_SUCCESS if the task is successfully enqueued and the CmEvent is generated;
//! CM_OUT_OF_HOST_MEMORY if out of host memory;
//! CM_FAILURE otherwise.
//! More error codes may be added in the future.
//*-----------------------------------------------------------------------------
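// A hedged usage sketch for the surface-to-host copy (illustrative only;
// "surface" and "sysMem" are assumed to exist and to be large enough). It polls
// the event status to illustrate the non-blocking nature of the call:
//
//     CmEvent *event = nullptr;
//     queue->EnqueueCopyGPUToCPU(surface, sysMem, event);
//
//     CM_STATUS status = CM_STATUS_QUEUED;
//     do
//     {
//         event->GetStatus(status);  // other CPU work can be done between polls
//     } while (status != CM_STATUS_FINISHED);
//     queue->DestroyEvent(event);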
CM_RT_API int32_t CmQueueRT::EnqueueCopyGPUToCPU( CmSurface2D* surface, unsigned char* sysMem, CmEvent* & event )
{
INSERT_API_CALL_LOG();
if (!m_device->HasGpuCopyKernel())
{
return CM_NOT_IMPLEMENTED;
}
CmSurface2DRT *surfaceRT = static_cast<CmSurface2DRT *>(surface);
return EnqueueCopyInternal(surfaceRT, sysMem, 0, 0, CM_FASTCOPY_GPU2CPU, CM_FASTCOPY_OPTION_NONBLOCKING, event);
}
int32_t CmQueueRT::EnqueueUnalignedCopyInternal( CmSurface2DRT* surface, unsigned char* sysMem, const uint32_t widthStride, const uint32_t heightStride, CM_GPUCOPY_DIRECTION direction)
{
int32_t hr = CM_SUCCESS;
uint32_t bufferupSize = 0;
uint32_t dstAddShiftOffset = 0;
uint32_t threadWidth = 0;
uint32_t threadHeight = 0;
uint32_t threadNum = 0;
uint32_t auxiliaryBufferupSize = 0;
uint32_t width = 0;
uint32_t height = 0;
uint32_t sizePerPixel = 0;
uint32_t widthByte = 0;
uint32_t copyWidthByte = 0;
uint32_t copyHeightRow = 0;
uint32_t strideInBytes = widthStride;
uint32_t heightStrideInRows = heightStride;
size_t linearAddress = (size_t)sysMem;
size_t linearAddressAligned = 0;
unsigned char* hybridCopyAuxSysMem = nullptr;
CmBufferUP *bufferUP = nullptr;
CmKernel *kernel = nullptr;
CmBufferUP *hybridCopyAuxBufferUP = nullptr;
SurfaceIndex *bufferIndexCM = nullptr;
SurfaceIndex *hybridCopyAuxIndexCM = nullptr;
SurfaceIndex *surf2DIndexCM = nullptr;
CmThreadSpace *threadSpace = nullptr;
CmTask *gpuCopyTask = nullptr;
CmProgram *gpuCopyProgram = nullptr;
CmEvent *event = nullptr;
CM_STATUS status;
CM_SURFACE_FORMAT format;
if ( surface )
{
CM_CHK_CMSTATUS_GOTOFINISH( surface->GetSurfaceDesc(width, height, format, sizePerPixel));
}
else
{
return CM_FAILURE;
}
widthByte = width * sizePerPixel;
// the actual copy region
copyWidthByte = MOS_MIN(strideInBytes, widthByte);
copyHeightRow = MOS_MIN(heightStrideInRows, height);
if(linearAddress == 0)
{
CM_ASSERTMESSAGE("Error: Pointer to system memory is null.");
return CM_INVALID_ARG_VALUE;
}
if( (copyWidthByte > CM_MAX_THREADSPACE_WIDTH_FOR_MW * BLOCK_WIDTH ) || ( copyHeightRow > CM_MAX_THREADSPACE_HEIGHT_FOR_MW * BLOCK_HEIGHT) )
{ // each thread handles 64x8 block data. This API will fail if it exceeds the max thread space's size
CM_ASSERTMESSAGE("Error: Invalid copy size.");
return CM_INVALID_ARG_SIZE;
}
if (sizeof (void *) == 8 ) //64-bit
{
linearAddressAligned = linearAddress & ADDRESS_PAGE_ALIGNMENT_MASK_X64;
}
else //32-bit
{
linearAddressAligned = linearAddress & ADDRESS_PAGE_ALIGNMENT_MASK_X86;
}
//Calculate Left Shift offset
dstAddShiftOffset = (uint32_t)(linearAddress - linearAddressAligned);
if (format == CM_SURFACE_FORMAT_NV12 || format == CM_SURFACE_FORMAT_P010 || format == CM_SURFACE_FORMAT_P016)
{
bufferupSize = MOS_ALIGN_CEIL(strideInBytes * (heightStrideInRows + copyHeightRow * 1/2) + (uint32_t)dstAddShiftOffset , 64);
}
else
{
bufferupSize = MOS_ALIGN_CEIL(strideInBytes * heightStrideInRows + (uint32_t)dstAddShiftOffset, 64);
}
CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateBufferUP(bufferupSize, ( void * )linearAddressAligned, bufferUP));
CM_CHK_CMSTATUS_GOTOFINISH(bufferUP->GetIndex(bufferIndexCM));
CM_CHK_CMSTATUS_GOTOFINISH(surface->GetIndex(surf2DIndexCM));
CM_CHK_CMSTATUS_GOTOFINISH( m_device->LoadPredefinedCopyKernel(gpuCopyProgram));
CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyProgram);
if (direction == CM_FASTCOPY_CPU2GPU)
{
if (format == CM_SURFACE_FORMAT_NV12 || format == CM_SURFACE_FORMAT_P010 || format == CM_SURFACE_FORMAT_P016)
{
CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(surfaceCopy_write_unaligned_NV12), kernel, "PredefinedGPUCopyKernel"));
}
else
{
CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(surfaceCopy_write_unaligned), kernel, "PredefinedGPUCopyKernel"));
}
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 0, sizeof( SurfaceIndex ), bufferIndexCM ));
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 1, sizeof( SurfaceIndex ), surf2DIndexCM ));
}
else
{
if (format == CM_SURFACE_FORMAT_NV12 || format == CM_SURFACE_FORMAT_P010 || format == CM_SURFACE_FORMAT_P016)
{
CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(surfaceCopy_read_unaligned_NV12), kernel, "PredefinedGPUCopyKernel"));
auxiliaryBufferupSize = BLOCK_WIDTH * 2 * (heightStrideInRows + copyHeightRow * 1/2);
}
else
{
CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(surfaceCopy_read_unaligned), kernel, "PredefinedGPUCopyKernel"));
auxiliaryBufferupSize = BLOCK_WIDTH * 2 * heightStrideInRows;
}
hybridCopyAuxSysMem = (unsigned char*)MOS_AlignedAllocMemory(auxiliaryBufferupSize, PAGE_ALIGNED);
if(!hybridCopyAuxSysMem)
{
CM_ASSERTMESSAGE("Error: Out of system memory.");
return CM_OUT_OF_HOST_MEMORY;
}
CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateBufferUP(auxiliaryBufferupSize, (void*)hybridCopyAuxSysMem, hybridCopyAuxBufferUP));
CM_CHK_CMSTATUS_GOTOFINISH(hybridCopyAuxBufferUP->GetIndex(hybridCopyAuxIndexCM));
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 0, sizeof( SurfaceIndex ), surf2DIndexCM ));
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 1, sizeof( SurfaceIndex ), bufferIndexCM ));
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 5, sizeof( uint32_t ), &copyWidthByte ));
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 6, sizeof( SurfaceIndex ), hybridCopyAuxIndexCM ));
}
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 2, sizeof( uint32_t ), &strideInBytes ));
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 3, sizeof( uint32_t ), &heightStrideInRows ));
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 4, sizeof( uint32_t ), &dstAddShiftOffset ));
threadWidth = ( uint32_t )ceil( ( double )copyWidthByte/BLOCK_WIDTH );
threadHeight = ( uint32_t )ceil( ( double )copyHeightRow/BLOCK_HEIGHT );
threadNum = threadWidth * threadHeight;
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetThreadCount( threadNum ));
CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateThreadSpace( threadWidth, threadHeight, threadSpace ));
CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateTask(gpuCopyTask));
CM_CHK_CMSTATUS_GOTOFINISH(gpuCopyTask->AddKernel( kernel ));
CM_CHK_CMSTATUS_GOTOFINISH(EnqueueFast(gpuCopyTask, event, threadSpace));
if(event)
{
CM_CHK_CMSTATUS_GOTOFINISH(event->GetStatus(status));
while(status != CM_STATUS_FINISHED)
{
if (status == CM_STATUS_RESET)
{
hr = CM_TASK_MEDIA_RESET;
goto finish;
}
CM_CHK_CMSTATUS_GOTOFINISH(event->GetStatus(status));
}
}
// CPU copy unaligned data
if( direction == CM_FASTCOPY_GPU2CPU)
{
uint32_t readOffset = 0;
uint32_t copyLines = 0;
unsigned char* startBuffer = (unsigned char*)linearAddressAligned;
copyLines = (format == CM_SURFACE_FORMAT_NV12 || format == CM_SURFACE_FORMAT_P010 || format == CM_SURFACE_FORMAT_P016) ? heightStrideInRows + MOS_MIN(heightStrideInRows, height) * 1 / 2 : heightStrideInRows;
for(uint32_t i = 0; i < copyLines; ++i)
{
//copy beginning of line
size_t beginLineWriteOffset = strideInBytes * i + dstAddShiftOffset;
uint32_t mod = ((uintptr_t)startBuffer + beginLineWriteOffset) < BLOCK_WIDTH ? ((uintptr_t)startBuffer + beginLineWriteOffset) : ((uintptr_t)startBuffer + beginLineWriteOffset) & (BLOCK_WIDTH - 1);
uint32_t beginLineCopySize = (mod == 0) ? 0:(BLOCK_WIDTH - mod);
//fix copy size for cases where the surface width is small
if((beginLineCopySize > widthByte) || ( beginLineCopySize == 0 && widthByte < BLOCK_WIDTH ) )
{
beginLineCopySize = widthByte;
}
if(beginLineCopySize > 0)
{
CmSafeMemCopy((void *)( (unsigned char *)startBuffer + beginLineWriteOffset), (void *)(hybridCopyAuxSysMem + readOffset), beginLineCopySize);
}
//copy end of line
uint32_t alignedWrites = (copyWidthByte - beginLineCopySize) &~ (BLOCK_WIDTH - 1);
uint32_t endLineWriteOffset = beginLineWriteOffset + alignedWrites + beginLineCopySize;
uint32_t endLineCopySize = dstAddShiftOffset+ i * strideInBytes + copyWidthByte - endLineWriteOffset;
if(endLineCopySize > 0 && endLineWriteOffset > beginLineWriteOffset)
{
CmSafeMemCopy((void *)((unsigned char *)startBuffer + endLineWriteOffset), (void *)(hybridCopyAuxSysMem + readOffset + BLOCK_WIDTH), endLineCopySize);
}
readOffset += (BLOCK_WIDTH * 2);
}
}
CM_CHK_CMSTATUS_GOTOFINISH(DestroyEventFast(event));
CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyTask(gpuCopyTask));
CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyThreadSpace(threadSpace));
CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyBufferUP(bufferUP));
if (direction == CM_FASTCOPY_GPU2CPU)
{
if(hybridCopyAuxBufferUP)
{
CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyBufferUP(hybridCopyAuxBufferUP));
}
if(hybridCopyAuxSysMem)
{
MOS_AlignedFreeMemory(hybridCopyAuxSysMem);
hybridCopyAuxSysMem = nullptr;
}
}
finish:
if(hr != CM_SUCCESS)
{
if(bufferUP == nullptr)
{
// the user needs to know whether the failure is caused by running out of BufferUP resources.
hr = CM_GPUCOPY_OUT_OF_RESOURCE;
}
if(event) DestroyEventFast(event);
if(kernel) m_device->DestroyKernel(kernel);
if(threadSpace) m_device->DestroyThreadSpace(threadSpace);
if(gpuCopyTask) m_device->DestroyTask(gpuCopyTask);
if(bufferUP) m_device->DestroyBufferUP(bufferUP);
if(hybridCopyAuxBufferUP) m_device->DestroyBufferUP(hybridCopyAuxBufferUP);
if(hybridCopyAuxSysMem) {MOS_AlignedFreeMemory(hybridCopyAuxSysMem); hybridCopyAuxSysMem = nullptr;}
}
return hr;
}
//*-----------------------------------------------------------------------------
//! Enqueue a task, which contains one pre-defined kernel to
//! copy from surface to host memory or from host memory to surface
//! This is a non-blocking call. i.e. it returns immediately without waiting for
//! GPU to finish the execution of the task.
//! A CmEvent is generated each time a task is enqueued. The CmEvent can
//! be used to check if the task finishes.
//! INPUT:
//! 1) Pointer to the CmSurface2D
//! 2) Pointer to the host memory
//! 3) Width stride in bytes; if there is no padding in system memory, it is set to zero.
//! 4) Height stride in rows; if there is no padding in system memory, it is set to zero.
//! 5) Copy direction, cpu->gpu (linear->tiled) or gpu->cpu (tiled->linear)
//! 6) Reference to the pointer to CMEvent
//! OUTPUT:
//! CM_SUCCESS if the task is successfully enqueued and the CmEvent is generated;
//! CM_OUT_OF_HOST_MEMORY if out of host memory;
//! CM_FAILURE otherwise.
//*-----------------------------------------------------------------------------
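// A worked example of the stride arguments above (the numbers are illustrative
// assumptions, not requirements): for a 1920x1080 NV12 surface copied to/from
// system memory padded to a 2048-byte pitch, widthStride = 2048 and
// heightStride = 1088, so the system buffer is expected to hold
//     2048 * 1088        bytes for the Y plane, plus
//     2048 * (1080 / 2)  bytes for the interleaved UV plane.
// Passing zero for either stride makes the copy assume a tightly packed buffer
// (stride = surface width in bytes, rows = surface height).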
int32_t CmQueueRT::EnqueueCopyInternal(CmSurface2DRT* surface,
unsigned char* sysMem,
const uint32_t widthStride,
const uint32_t heightStride,
CM_GPUCOPY_DIRECTION direction,
const uint32_t option,
CmEvent* & event)
{
int32_t hr = CM_FAILURE;
uint32_t width = 0;
uint32_t height = 0;
uint32_t sizePerPixel = 0;
CM_SURFACE_FORMAT format = CM_SURFACE_FORMAT_INVALID;
if (surface)
{
CM_CHK_CMSTATUS_GOTOFINISH(surface->GetSurfaceDesc(width, height, format, sizePerPixel));
}
else
{
return CM_GPUCOPY_INVALID_SURFACES;
}
if (format == CM_SURFACE_FORMAT_NV12 || format == CM_SURFACE_FORMAT_P010 || format == CM_SURFACE_FORMAT_P016)
{
hr = EnqueueCopyInternal_2Planes(surface, (unsigned char*)sysMem, format, width, widthStride, height, heightStride, sizePerPixel, direction, option, event);
}
else
{
hr = EnqueueCopyInternal_1Plane(surface, (unsigned char*)sysMem, format, width, widthStride, height, heightStride, sizePerPixel, direction, option, event);
}
finish:
return hr;
}
int32_t CmQueueRT::EnqueueCopyInternal_1Plane(CmSurface2DRT* surface,
unsigned char* sysMem,
CM_SURFACE_FORMAT format,
const uint32_t widthInPixel,
const uint32_t widthStride,
const uint32_t heightInRow,
const uint32_t heightStride,
const uint32_t sizePerPixel,
CM_GPUCOPY_DIRECTION direction,
const uint32_t option,
CmEvent* & event )
{
int32_t hr = CM_SUCCESS;
uint32_t tempHeight = heightInRow;
uint32_t strideInBytes = widthStride;
uint32_t strideInDwords = 0;
uint32_t heightStrideInRows = heightStride;
uint32_t addedShiftLeftOffset = 0;
size_t linearAddress = (size_t)sysMem;
size_t linearAddressAligned = 0;
CmKernel *kernel = nullptr;
CmBufferUP *cmbufferUP = nullptr;
SurfaceIndex *bufferIndexCM = nullptr;
SurfaceIndex *surf2DIndexCM = nullptr;
CmThreadSpace *threadSpace = nullptr;
CmTask *gpuCopyTask = nullptr;
CmEvent *internalEvent = nullptr;
uint32_t threadWidth = 0;
uint32_t threadHeight = 0;
uint32_t threadNum = 0;
uint32_t widthDword = 0;
uint32_t widthByte = 0;
uint32_t copyWidthByte = 0;
uint32_t copyHeightRow = 0;
uint32_t sliceCopyHeightRow = 0;
uint32_t sliceCopyBufferUPSize = 0;
int32_t totalBufferUPSize = 0;
uint32_t startX = 0;
uint32_t startY = 0;
bool blSingleEnqueue = true;
CM_GPUCOPY_KERNEL *gpuCopyKernelParam = nullptr;
PCM_HAL_STATE cmHalState = \
((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
widthByte = widthInPixel * sizePerPixel;
//Default the stride to the surface width in bytes when no stride is specified
if(strideInBytes == 0)
{
strideInBytes = widthByte;
}
if(heightStrideInRows == 0)
{
heightStrideInRows = heightInRow;
}
// the actual copy region
copyWidthByte = MOS_MIN(strideInBytes, widthByte);
copyHeightRow = MOS_MIN(heightStrideInRows, heightInRow);
// Make sure the stride and the start address of system memory are 16-byte aligned.
// If there is no padding in system memory, strideInBytes = widthByte.
if(strideInBytes & 0xf)
{
CM_ASSERTMESSAGE("Error: Stride is not 16-byte aligned.");
return CM_GPUCOPY_INVALID_STRIDE;
}
if((linearAddress & 0xf) || (linearAddress == 0))
{
CM_ASSERTMESSAGE("Error: Start address of system memory is not 16-byte aligned.");
return CM_GPUCOPY_INVALID_SYSMEM;
}
//Calculate actual total size of system memory
totalBufferUPSize = strideInBytes * heightStrideInRows;
//Check thread space width here
if( copyWidthByte > CM_MAX_THREADSPACE_WIDTH_FOR_MW * BLOCK_PIXEL_WIDTH *4 )
{ // each thread handles 128x8 block data. This API will fail if it exceeds the max thread space's size
CM_ASSERTMESSAGE("Error: Invalid copy size.");
return CM_GPUCOPY_INVALID_SIZE;
}
while (totalBufferUPSize > 0)
{
if (sizeof (void *) == 8 ) //64-bit
{
linearAddressAligned = linearAddress & ADDRESS_PAGE_ALIGNMENT_MASK_X64;
}
else //32-bit
{
linearAddressAligned = linearAddress & ADDRESS_PAGE_ALIGNMENT_MASK_X86;
}
//Calculate Left Shift offset
addedShiftLeftOffset = (uint32_t)(linearAddress - linearAddressAligned);
totalBufferUPSize += addedShiftLeftOffset;
if (totalBufferUPSize > CM_MAX_1D_SURF_WIDTH)
{
blSingleEnqueue = false;
sliceCopyHeightRow = ((CM_MAX_1D_SURF_WIDTH - addedShiftLeftOffset)/(strideInBytes*(BLOCK_HEIGHT * INNER_LOOP))) * (BLOCK_HEIGHT * INNER_LOOP);
sliceCopyBufferUPSize = sliceCopyHeightRow * strideInBytes + addedShiftLeftOffset;
tempHeight = sliceCopyHeightRow;
}
else
{
sliceCopyHeightRow = copyHeightRow;
sliceCopyBufferUPSize = totalBufferUPSize;
if (!blSingleEnqueue)
{
tempHeight = sliceCopyHeightRow;
}
}
//Check thread space height here
if(sliceCopyHeightRow > CM_MAX_THREADSPACE_HEIGHT_FOR_MW * BLOCK_HEIGHT * INNER_LOOP )
{ // each thread handles 128x8 block data. This API will fail if it exceeds the max thread space's size
CM_ASSERTMESSAGE("Error: Invalid copy size.");
return CM_GPUCOPY_INVALID_SIZE;
}
kernel = nullptr;
CM_CHK_CMSTATUS_GOTOFINISH( m_device->CreateBufferUP( sliceCopyBufferUPSize, ( void * )linearAddressAligned, cmbufferUP ));
CM_CHK_NULL_GOTOFINISH_CMERROR(cmbufferUP);
//Configure memory object control for BufferUP to solve the cache-line issue.
if (cmHalState->cmHalInterface->IsGPUCopySurfaceNoCacheWARequired())
{
CM_CHK_CMSTATUS_GOTOFINISH(cmbufferUP->SelectMemoryObjectControlSetting(MEMORY_OBJECT_CONTROL_SKL_NO_LLC_L3));
}
CM_CHK_CMSTATUS_GOTOFINISH(CreateGPUCopyKernel(copyWidthByte, sliceCopyHeightRow, format, direction, gpuCopyKernelParam));
CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyKernelParam);
kernel = gpuCopyKernelParam->kernel;
CM_CHK_NULL_GOTOFINISH_CMERROR(kernel);
CM_CHK_NULL_GOTOFINISH_CMERROR(cmbufferUP);
CM_CHK_CMSTATUS_GOTOFINISH(cmbufferUP->GetIndex( bufferIndexCM ));
CM_CHK_CMSTATUS_GOTOFINISH(surface->GetIndex( surf2DIndexCM ));
threadWidth = ( uint32_t )ceil( ( double )copyWidthByte/BLOCK_PIXEL_WIDTH/4 );
threadHeight = ( uint32_t )ceil( ( double )sliceCopyHeightRow/BLOCK_HEIGHT/INNER_LOOP );
threadNum = threadWidth * threadHeight;
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetThreadCount( threadNum ));
CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateThreadSpace( threadWidth, threadHeight, threadSpace ));
if(direction == CM_FASTCOPY_GPU2CPU)
{
surface->SetReadSyncFlag(true, this); // GPU -> CPU, set surf2d as read sync flag
}
if( direction == CM_FASTCOPY_CPU2GPU)
{
if (cmHalState->cmHalInterface->IsSurfaceCompressionWARequired())
{
CM_CHK_CMSTATUS_GOTOFINISH(surface->SetCompressionMode(MEMCOMP_DISABLED));
}
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 0, sizeof( SurfaceIndex ), bufferIndexCM) );
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 1, sizeof( SurfaceIndex ), surf2DIndexCM ));
}
else
{
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 1, sizeof( SurfaceIndex ), bufferIndexCM ));
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 0, sizeof( SurfaceIndex ), surf2DIndexCM ));
}
widthDword = (uint32_t)ceil((double)widthByte / 4);
strideInDwords = (uint32_t)ceil((double)strideInBytes / 4);
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 2, sizeof( uint32_t ), &strideInDwords ));
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 3, sizeof( uint32_t ), &heightStrideInRows ));
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 4, sizeof( uint32_t ), &addedShiftLeftOffset ));
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 5, sizeof( uint32_t ), &threadHeight ));
if (direction == CM_FASTCOPY_GPU2CPU) //GPU-->CPU, read
{
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 6, sizeof( uint32_t ), &widthDword ));
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 7, sizeof( uint32_t ), &tempHeight ));
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 8, sizeof(uint32_t), &startX));
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 9, sizeof(uint32_t), &startY));
}
else //CPU-->GPU, write
{
//this only works for the kernel surfaceCopy_write_32x32
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 6, sizeof( uint32_t ), &startX ));
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 7, sizeof( uint32_t ), &startY ));
}
CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateTask(gpuCopyTask));
CM_CHK_CMSTATUS_GOTOFINISH(gpuCopyTask->AddKernel( kernel ));
if (option & CM_FASTCOPY_OPTION_DISABLE_TURBO_BOOST)
{
// disable turbo
CM_TASK_CONFIG taskConfig;
CmSafeMemSet(&taskConfig, 0, sizeof(CM_TASK_CONFIG));
taskConfig.turboBoostFlag = CM_TURBO_BOOST_DISABLE;
gpuCopyTask->SetProperty(taskConfig);
}
CM_CHK_CMSTATUS_GOTOFINISH(EnqueueFast(gpuCopyTask, internalEvent,
threadSpace));
GPUCOPY_KERNEL_UNLOCK(gpuCopyKernelParam);
//update for next slice
linearAddress += sliceCopyBufferUPSize - addedShiftLeftOffset;
totalBufferUPSize -= sliceCopyBufferUPSize;
copyHeightRow -= sliceCopyHeightRow;
startX = 0;
startY += sliceCopyHeightRow;
if(totalBufferUPSize > 0) //Intermediate event, we don't need it
{
CM_CHK_CMSTATUS_GOTOFINISH(DestroyEventFast(internalEvent));
}
else //Last event, keep it or destroy it as requested
{
if ((option & CM_FASTCOPY_OPTION_BLOCKING) && (internalEvent))
{
CM_CHK_CMSTATUS_GOTOFINISH(internalEvent->WaitForTaskFinished());
}
if(event == CM_NO_EVENT) //User doesn't need CmEvent for this copy
{
event = nullptr;
CM_CHK_CMSTATUS_GOTOFINISH(DestroyEventFast(internalEvent));
}
else //User needs this CmEvent
{
event = internalEvent;
}
}
CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyTask(gpuCopyTask));
CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyThreadSpace(threadSpace));
CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyBufferUP(cmbufferUP));
}
finish:
if(hr != CM_SUCCESS)
{
if(cmbufferUP == nullptr)
{
// the user needs to know whether the failure is caused by running out of BufferUP resources.
hr = CM_GPUCOPY_OUT_OF_RESOURCE;
}
if(kernel && gpuCopyKernelParam) GPUCOPY_KERNEL_UNLOCK(gpuCopyKernelParam);
if(threadSpace) m_device->DestroyThreadSpace(threadSpace);
if(gpuCopyTask) m_device->DestroyTask(gpuCopyTask);
if(cmbufferUP) m_device->DestroyBufferUP(cmbufferUP);
if(internalEvent) DestroyEventFast(internalEvent);
// CM_FAILURE for all the other errors
// return CM_EXCEED_MAX_TIMEOUT to notify app that gpu reset happens
if( hr != CM_GPUCOPY_OUT_OF_RESOURCE && hr != CM_EXCEED_MAX_TIMEOUT)
{
hr = CM_FAILURE;
}
}
return hr;
}
int32_t CmQueueRT::EnqueueCopyInternal_2Planes(CmSurface2DRT* surface,
unsigned char* sysMem,
CM_SURFACE_FORMAT format,
const uint32_t widthInPixel,
const uint32_t widthStride,
const uint32_t heightInRow,
const uint32_t heightStride,
const uint32_t sizePerPixel,
CM_GPUCOPY_DIRECTION direction,
const uint32_t option,
CmEvent* & event)
{
int32_t hr = CM_SUCCESS;
uint32_t strideInBytes = widthStride;
uint32_t strideInDwords = 0;
uint32_t heightStrideInRows = heightStride;
size_t linearAddressY = 0;
size_t linearAddressUV = 0;
size_t linearAddressAlignedY = 0;
size_t linearAddressAlignedUV = 0;
uint32_t addedShiftLeftOffsetY = 0;
uint32_t addedShiftLeftOffsetUV = 0;
CmKernel *kernel = nullptr;
CmBufferUP *cmbufferUPY = nullptr;
CmBufferUP *cmbufferUPUV = nullptr;
SurfaceIndex *bufferUPIndexY = nullptr;
SurfaceIndex *bufferUPIndexUV = nullptr;
SurfaceIndex *surf2DIndexCM = nullptr;
CmThreadSpace *threadSpace = nullptr;
CmTask *gpuCopyTask = nullptr;
CmEvent *internalEvent = nullptr;
uint32_t threadWidth = 0;
uint32_t threadHeight = 0;
uint32_t threadNum = 0;
uint32_t widthDword = 0;
uint32_t widthByte = 0;
uint32_t copyWidthByte = 0;
uint32_t copyHeightRow = 0;
uint32_t bufferUPYSize = 0;
uint32_t bufferUPUVSize = 0;
CM_GPUCOPY_KERNEL *gpuCopyKernelParam = nullptr;
PCM_HAL_STATE cmHalState = \
((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
widthByte = widthInPixel * sizePerPixel;
//Default the stride to the surface width in bytes when no stride is specified
if (strideInBytes == 0)
{
strideInBytes = widthByte;
}
if (heightStrideInRows == 0)
{
heightStrideInRows = heightInRow;
}
// the actual copy region
copyWidthByte = MOS_MIN(strideInBytes, widthByte);
copyHeightRow = MOS_MIN(heightStrideInRows, heightInRow);
// Make sure the stride and the start address of system memory are 16-byte aligned.
// If there is no padding in system memory, strideInBytes = widthByte.
if (strideInBytes & 0xf)
{
CM_ASSERTMESSAGE("Error: Stride is not 16-byte aligned.");
return CM_GPUCOPY_INVALID_STRIDE;
}
//Check thread space width here
if (copyWidthByte > CM_MAX_THREADSPACE_WIDTH_FOR_MW * BLOCK_PIXEL_WIDTH * 4)
{ // each thread handles 128x8 block data. This API will fail if it exceeds the max thread space's size
CM_ASSERTMESSAGE("Error: Invalid copy size.");
return CM_GPUCOPY_INVALID_SIZE;
}
linearAddressY = (size_t)sysMem;
linearAddressUV = (size_t)((char*)sysMem + strideInBytes * heightStrideInRows);
if ((linearAddressY & 0xf) || (linearAddressY == 0) || (linearAddressUV & 0xf)) // the UV start address must also be 16-byte aligned
{
CM_ASSERTMESSAGE("Error: Start address of system memory is not 16-byte aligned.");
return CM_GPUCOPY_INVALID_SYSMEM;
}
if (sizeof (void *) == 8) //64-bit
{
linearAddressAlignedY = linearAddressY & ADDRESS_PAGE_ALIGNMENT_MASK_X64;
linearAddressAlignedUV = linearAddressUV & ADDRESS_PAGE_ALIGNMENT_MASK_X64;
}
else //32-bit
{
linearAddressAlignedY = linearAddressY & ADDRESS_PAGE_ALIGNMENT_MASK_X86;
linearAddressAlignedUV = linearAddressUV & ADDRESS_PAGE_ALIGNMENT_MASK_X86;
}
//Calculate Left Shift offset
addedShiftLeftOffsetY = (uint32_t)(linearAddressY - linearAddressAlignedY);
addedShiftLeftOffsetUV = (uint32_t)(linearAddressUV - linearAddressAlignedUV);
//Calculate actual total size of system memory, assume it's NV12/P010/P016 formats
bufferUPYSize = strideInBytes * heightStrideInRows + addedShiftLeftOffsetY;
bufferUPUVSize = strideInBytes * copyHeightRow * 1 / 2 + addedShiftLeftOffsetUV;
//Check thread space height here
if (copyHeightRow > CM_MAX_THREADSPACE_HEIGHT_FOR_MW * BLOCK_HEIGHT * INNER_LOOP)
{ // each thread handles 128x8 block data. This API will fail if it exceeds the max thread space's size
CM_ASSERTMESSAGE("Error: Invalid copy size.");
return CM_GPUCOPY_INVALID_SIZE;
}
kernel = nullptr;
CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateBufferUP(bufferUPYSize, (void *)linearAddressAlignedY, cmbufferUPY));
CM_CHK_NULL_GOTOFINISH_CMERROR(cmbufferUPY);
CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateBufferUP(bufferUPUVSize, (void *)linearAddressAlignedUV, cmbufferUPUV));
CM_CHK_NULL_GOTOFINISH_CMERROR(cmbufferUPUV);
//Configure memory object control for the two BufferUP to solve the same cache-line coherency issue.
if (cmHalState->cmHalInterface->IsGPUCopySurfaceNoCacheWARequired())
{
CM_CHK_CMSTATUS_GOTOFINISH(cmbufferUPY->SelectMemoryObjectControlSetting(MEMORY_OBJECT_CONTROL_SKL_NO_LLC_L3));
CM_CHK_CMSTATUS_GOTOFINISH(cmbufferUPUV->SelectMemoryObjectControlSetting(MEMORY_OBJECT_CONTROL_SKL_NO_LLC_L3));
}
else
{
CM_CHK_CMSTATUS_GOTOFINISH(static_cast< CmBuffer_RT* >(cmbufferUPY)->SetMemoryObjectControl(MEMORY_OBJECT_CONTROL_FROM_GTT_ENTRY, CM_WRITE_THROUGH, 0));
CM_CHK_CMSTATUS_GOTOFINISH(static_cast< CmBuffer_RT* >(cmbufferUPUV)->SetMemoryObjectControl(MEMORY_OBJECT_CONTROL_FROM_GTT_ENTRY, CM_WRITE_THROUGH, 0));
}
CM_CHK_CMSTATUS_GOTOFINISH(CreateGPUCopyKernel(copyWidthByte, copyHeightRow, format, direction, gpuCopyKernelParam));
CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyKernelParam);
kernel = gpuCopyKernelParam->kernel;
CM_CHK_NULL_GOTOFINISH_CMERROR(kernel);
CM_CHK_NULL_GOTOFINISH_CMERROR(cmbufferUPY);
CM_CHK_NULL_GOTOFINISH_CMERROR(cmbufferUPUV);
CM_CHK_CMSTATUS_GOTOFINISH(cmbufferUPY->GetIndex(bufferUPIndexY));
CM_CHK_CMSTATUS_GOTOFINISH(cmbufferUPUV->GetIndex(bufferUPIndexUV));
CM_CHK_CMSTATUS_GOTOFINISH(surface->GetIndex(surf2DIndexCM));
threadWidth = (uint32_t)ceil((double)copyWidthByte / BLOCK_PIXEL_WIDTH / 4);
threadHeight = (uint32_t)ceil((double)copyHeightRow / BLOCK_HEIGHT / INNER_LOOP);
threadNum = threadWidth * threadHeight;
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetThreadCount(threadNum));
CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateThreadSpace(threadWidth, threadHeight, threadSpace));
widthDword = (uint32_t)ceil((double)widthByte / 4);
strideInDwords = (uint32_t)ceil((double)strideInBytes / 4);
if (direction == CM_FASTCOPY_CPU2GPU) //Write
{
//Input BufferUP_Y and BufferUP_UV
if (cmHalState->cmHalInterface->IsSurfaceCompressionWARequired())
{
CM_CHK_CMSTATUS_GOTOFINISH(surface->SetCompressionMode(MEMCOMP_DISABLED));
}
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(0, sizeof(SurfaceIndex), bufferUPIndexY));
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(1, sizeof(SurfaceIndex), bufferUPIndexUV));
//Output Surface2D
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(2, sizeof(SurfaceIndex), surf2DIndexCM));
//Other parameters
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(3, sizeof(uint32_t), &strideInDwords));
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(4, sizeof(uint32_t), &heightStrideInRows));
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(5, sizeof(uint32_t), &addedShiftLeftOffsetY));
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(6, sizeof(uint32_t), &addedShiftLeftOffsetUV));
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(7, sizeof(uint32_t), &threadHeight));
}
else //Read
{
//Input Surface2D
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(0, sizeof(SurfaceIndex), surf2DIndexCM));
//Output BufferUP_Y and BufferUP_UV
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(1, sizeof(SurfaceIndex), bufferUPIndexY));
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(2, sizeof(SurfaceIndex), bufferUPIndexUV));
//Other parameters
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(3, sizeof(uint32_t), &strideInDwords));
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(4, sizeof(uint32_t), &heightStrideInRows));
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(5, sizeof(uint32_t), &addedShiftLeftOffsetY));
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(6, sizeof(uint32_t), &addedShiftLeftOffsetUV));
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(7, sizeof(uint32_t), &threadHeight));
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(8, sizeof(uint32_t), &widthDword));
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(9, sizeof(uint32_t), &heightInRow));
surface->SetReadSyncFlag(true, this); // GPU -> CPU, set surf2d as read sync flag
}
CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateTask(gpuCopyTask));
CM_CHK_CMSTATUS_GOTOFINISH(gpuCopyTask->AddKernel(kernel));
if (option & CM_FASTCOPY_OPTION_DISABLE_TURBO_BOOST)
{
// disable turbo
CM_TASK_CONFIG taskConfig;
CmSafeMemSet(&taskConfig, 0, sizeof(CM_TASK_CONFIG));
taskConfig.turboBoostFlag = CM_TURBO_BOOST_DISABLE;
gpuCopyTask->SetProperty(taskConfig);
}
CM_CHK_CMSTATUS_GOTOFINISH(EnqueueFast(gpuCopyTask, internalEvent,
threadSpace));
GPUCOPY_KERNEL_UNLOCK(gpuCopyKernelParam);
if ((option & CM_FASTCOPY_OPTION_BLOCKING) && (internalEvent))
{
CM_CHK_CMSTATUS_GOTOFINISH(internalEvent->WaitForTaskFinished());
}
if (event == CM_NO_EVENT) //User doesn't need CmEvent for this copy
{
event = nullptr;
CM_CHK_CMSTATUS_GOTOFINISH(DestroyEventFast(internalEvent));
}
else //User needs this CmEvent
{
event = internalEvent;
}
CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyTask(gpuCopyTask));
CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyThreadSpace(threadSpace));
CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyBufferUP(cmbufferUPY));
CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyBufferUP(cmbufferUPUV));
finish:
if (hr != CM_SUCCESS)
{
if ((cmbufferUPY == nullptr) || (cmbufferUPUV == nullptr))
{
// The user needs to know whether the failure was caused by running out of BufferUP resources.
hr = CM_GPUCOPY_OUT_OF_RESOURCE;
}
if (kernel && gpuCopyKernelParam) GPUCOPY_KERNEL_UNLOCK(gpuCopyKernelParam);
if (threadSpace) m_device->DestroyThreadSpace(threadSpace);
if (gpuCopyTask) m_device->DestroyTask(gpuCopyTask);
if (cmbufferUPY) m_device->DestroyBufferUP(cmbufferUPY);
if (cmbufferUPUV) m_device->DestroyBufferUP(cmbufferUPUV);
if (internalEvent) DestroyEventFast(internalEvent);
// CM_FAILURE for all the other errors
// return CM_EXCEED_MAX_TIMEOUT to notify app that gpu reset happens
if( hr != CM_GPUCOPY_OUT_OF_RESOURCE && hr != CM_EXCEED_MAX_TIMEOUT)
{
hr = CM_FAILURE;
}
}
return hr;
}
//*-----------------------------------------------------------------------------
//! Enqueue a task containing one pre-defined kernel to copy from video memory to video memory.
//! This is a non-blocking call. i.e. it returns immediately without waiting for
//! GPU to finish the execution of the task.
//! A CmEvent is generated each time a task is enqueued. The CmEvent can
//! be used to check if the task finishes.
//! INPUT:
//! 1) Pointer to the CmSurface2D as copy destination
//! 2) Pointer to the CmSurface2D as copy source
//! 3) Option passed from user, blocking copy, non-blocking copy or disable turbo boost
//! 4) Reference to the pointer to CMEvent
//! OUTPUT:
//! CM_SUCCESS if the task is successfully enqueued and the CmEvent is generated;
//! CM_OUT_OF_HOST_MEMORY if out of host memory;
//! CM_GPUCOPY_INVALID_SURFACES if input/output surfaces' width/format are different or
//! input surface's height is larger than output surface's
//! Restrictions:
//! 1) Surface's width should be 64-byte aligned.
//! 2) The input surface's width/height/format should be the same as output surface's.
//*-----------------------------------------------------------------------------
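//
// Illustrative usage only (a sketch, not part of the runtime): "queue",
// "srcSurf" and "dstSurf" are hypothetical objects owned by the application.
//
//     CmEvent *copyEvent = nullptr;
//     int32_t res = queue->EnqueueCopyGPUToGPU(dstSurf, srcSurf,
//                                              CM_FASTCOPY_OPTION_BLOCKING,
//                                              copyEvent);
//     // With CM_FASTCOPY_OPTION_BLOCKING the call waits for the copy to
//     // finish; without it, copyEvent can be queried or waited on later.
//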
CM_RT_API int32_t CmQueueRT::EnqueueCopyGPUToGPU( CmSurface2D* outputSurface, CmSurface2D* inputSurface, uint32_t option, CmEvent* & event )
{
INSERT_API_CALL_LOG();
if (!m_device->HasGpuCopyKernel())
{
return CM_NOT_IMPLEMENTED;
}
uint32_t srcSurfaceWidth = 0;
uint32_t srcSurfaceHeight = 0;
uint32_t dstSurfaceWidth = 0;
uint32_t dstSurfaceHeight = 0;
CM_SURFACE_FORMAT srcSurfaceFormat = CM_SURFACE_FORMAT_INVALID;
CM_SURFACE_FORMAT dstSurfaceFormat = CM_SURFACE_FORMAT_INVALID;
int32_t hr = CM_SUCCESS;
uint32_t srcSizePerPixel = 0;
uint32_t dstSizePerPixel = 0;
uint32_t threadWidth = 0;
uint32_t threadHeight = 0;
CmKernel *kernel = nullptr;
SurfaceIndex *surfaceInputIndex = nullptr;
SurfaceIndex *surfaceOutputIndex = nullptr;
CmThreadSpace *threadSpace = nullptr;
CmTask *task = nullptr;
uint32_t srcSurfAlignedWidthInBytes = 0;
CM_GPUCOPY_KERNEL *gpuCopyKernelParam = nullptr;
if ((outputSurface == nullptr) || (inputSurface == nullptr))
{
CM_ASSERTMESSAGE("Error: Pointer to input surface or output surface is null.");
return CM_FAILURE;
}
PCM_HAL_STATE cmHalState = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
CmSurface2DRT *outputSurfaceRT = static_cast<CmSurface2DRT *>(outputSurface);
CmSurface2DRT *inputSurfaceRT = static_cast<CmSurface2DRT *>(inputSurface);
if (cmHalState->cmHalInterface->IsSurfaceCompressionWARequired())
{
CM_CHK_CMSTATUS_GOTOFINISH(outputSurfaceRT->SetCompressionMode(MEMCOMP_DISABLED));
}
CM_CHK_CMSTATUS_GOTOFINISH(outputSurfaceRT->GetSurfaceDesc(dstSurfaceWidth, dstSurfaceHeight, dstSurfaceFormat, dstSizePerPixel));
CM_CHK_CMSTATUS_GOTOFINISH(inputSurfaceRT->GetSurfaceDesc(srcSurfaceWidth, srcSurfaceHeight, srcSurfaceFormat, srcSizePerPixel));
if ((dstSurfaceWidth != srcSurfaceWidth) ||
(dstSurfaceHeight < srcSurfaceHeight) || //relax the restriction
(dstSizePerPixel != srcSizePerPixel))
{
CM_ASSERTMESSAGE("Error: Size of dest surface does not match src surface.");
return CM_GPUCOPY_INVALID_SURFACES;
}
//To support copy b/w Format_A8R8G8B8 and Format_A8B8G8R8
if (dstSurfaceFormat != srcSurfaceFormat)
{
if (!((dstSurfaceFormat == CM_SURFACE_FORMAT_A8R8G8B8) && (srcSurfaceFormat == CM_SURFACE_FORMAT_A8B8G8R8)) &&
!((dstSurfaceFormat == CM_SURFACE_FORMAT_A8B8G8R8) && (srcSurfaceFormat == CM_SURFACE_FORMAT_A8R8G8B8)))
{
CM_ASSERTMESSAGE("Error: Only support copy b/w Format_A8R8G8B8 and Format_A8B8G8R8 if src format is not matched with dst format.");
return CM_GPUCOPY_INVALID_SURFACES;
}
}
// 128Bytes aligned
srcSurfAlignedWidthInBytes = (uint32_t)(ceil((double)srcSurfaceWidth*srcSizePerPixel / BLOCK_PIXEL_WIDTH / 4) * (BLOCK_PIXEL_WIDTH * 4));
if (srcSurfaceHeight > CM_MAX_THREADSPACE_WIDTH_FOR_MW *BLOCK_HEIGHT *INNER_LOOP)
{
CM_ASSERTMESSAGE("Error: Invalid copy size.");
return CM_GPUCOPY_INVALID_SIZE;
}
CM_CHK_CMSTATUS_GOTOFINISH(CreateGPUCopyKernel(srcSurfaceWidth*srcSizePerPixel, srcSurfaceHeight, srcSurfaceFormat, CM_FASTCOPY_GPU2GPU, gpuCopyKernelParam));
CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyKernelParam);
CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyKernelParam->kernel);
kernel = gpuCopyKernelParam->kernel;
CM_CHK_CMSTATUS_GOTOFINISH(inputSurface->GetIndex(surfaceInputIndex));
CM_CHK_CMSTATUS_GOTOFINISH(outputSurface->GetIndex(surfaceOutputIndex));
threadWidth = srcSurfAlignedWidthInBytes / (BLOCK_PIXEL_WIDTH * 4);
threadHeight = (uint32_t)ceil((double)srcSurfaceHeight / BLOCK_HEIGHT / INNER_LOOP);
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetThreadCount(threadWidth * threadHeight));
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(0, sizeof(SurfaceIndex), surfaceInputIndex));
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(1, sizeof(SurfaceIndex), surfaceOutputIndex));
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(2, sizeof(uint32_t), &threadHeight));
CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateThreadSpace(threadWidth, threadHeight, threadSpace));
CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateTask(task));
CM_CHK_NULL_GOTOFINISH_CMERROR(task);
CM_CHK_CMSTATUS_GOTOFINISH(task->AddKernel(kernel));
if (option & CM_FASTCOPY_OPTION_DISABLE_TURBO_BOOST)
{
// disable turbo
CM_TASK_CONFIG taskConfig;
CmSafeMemSet(&taskConfig, 0, sizeof(CM_TASK_CONFIG));
taskConfig.turboBoostFlag = CM_TURBO_BOOST_DISABLE;
task->SetProperty(taskConfig);
}
CM_CHK_CMSTATUS_GOTOFINISH(EnqueueFast(task, event, threadSpace));
if ((option & CM_FASTCOPY_OPTION_BLOCKING) && (event))
{
CM_CHK_CMSTATUS_GOTOFINISH(event->WaitForTaskFinished());
}
finish:
if (kernel && gpuCopyKernelParam) GPUCOPY_KERNEL_UNLOCK(gpuCopyKernelParam);
if (threadSpace) m_device->DestroyThreadSpace(threadSpace);
if (task) m_device->DestroyTask(task);
return hr;
}
//*-----------------------------------------------------------------------------
//! Enqueue a task containing one pre-defined kernel to copy from system memory to system memory.
//! This is a non-blocking call. i.e. it returns immediately without waiting for
//! GPU to finish the execution of the task.
//! A CmEvent is generated each time a task is enqueued. The CmEvent can be used to check if the task finishes.
//! If the size is less than the per-thread copy size (4KB), the CPU is used to do the copy and event will be set to nullptr.
//!
//! INPUT:
//! 1) Pointer to the system memory as copy destination
//! 2) Pointer to the system memory as copy source
//! 3) The size in bytes of the memory to be copied.
//! 4) Option passed from user, blocking copy, non-blocking copy or disable turbo boost
//! 5) Reference to the pointer to CMEvent
//! OUTPUT:
//! CM_SUCCESS if the task is successfully enqueued and the CmEvent is generated;
//! CM_OUT_OF_HOST_MEMORY if out of host memory;
//! CM_GPUCOPY_INVALID_SYSMEM if the sysMem is not 16-byte aligned or is NULL.
//! CM_GPUCOPY_OUT_OF_RESOURCE if the runtime runs out of BufferUP.
//! CM_GPUCOPY_INVALID_SIZE if the size plus the shift-left offset is larger than CM_MAX_1D_SURF_WIDTH.
//! Restrictions:
//! 1) dstSysMem and srcSysMem should be 16-byte aligned.
//*-----------------------------------------------------------------------------
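//
// Illustrative usage only (a sketch; "queue", "dst" and "src" are hypothetical
// application-side names, and both pointers are assumed 16-byte aligned):
//
//     CmEvent *copyEvent = nullptr;
//     int32_t res = queue->EnqueueCopyCPUToCPU(dst, src, sizeInBytes,
//                                              CM_FASTCOPY_OPTION_BLOCKING,
//                                              copyEvent);
//     // For sizes below BYTE_COPY_ONE_THREAD the copy is done on the CPU and
//     // copyEvent comes back as nullptr, so check it before waiting on it.
//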
CM_RT_API int32_t CmQueueRT::EnqueueCopyCPUToCPU( unsigned char* dstSysMem, unsigned char* srcSysMem, uint32_t size, uint32_t option, CmEvent* & event )
{
INSERT_API_CALL_LOG();
if (!m_device->HasGpuCopyKernel())
{
return CM_NOT_IMPLEMENTED;
}
int hr = CM_SUCCESS;
size_t inputLinearAddress = (size_t )srcSysMem;
size_t outputLinearAddress = (size_t )dstSysMem;
size_t inputLinearAddressAligned = 0;
size_t outputLinearAddressAligned = 0;
CmBufferUP *surfaceInput = nullptr;
CmBufferUP *surfaceOutput = nullptr;
CmKernel *kernel = nullptr;
SurfaceIndex *surfaceInputIndex = nullptr;
SurfaceIndex *surfaceOutputIndex = nullptr;
CmThreadSpace *threadSpace = nullptr;
CmTask *task = nullptr;
int32_t srcLeftShiftOffset = 0;
int32_t dstLeftShiftOffset = 0;
uint32_t threadWidth = 0;
uint32_t threadHeight = 0;
uint32_t threadNum = 0;
uint32_t gpuMemcopySize = 0;
uint32_t cpuMemcopySize = 0;
CM_GPUCOPY_KERNEL *gpuCopyKernelParam = nullptr;
if((inputLinearAddress & 0xf) || (outputLinearAddress & 0xf) ||
(inputLinearAddress == 0) || (outputLinearAddress == 0))
{
CM_ASSERTMESSAGE("Error: Start address of system memory is not 16-byte aligned.");
return CM_GPUCOPY_INVALID_SYSMEM;
}
// Get page aligned address
if (sizeof (void *) == 8 ) //64-bit
{
inputLinearAddressAligned = inputLinearAddress & ADDRESS_PAGE_ALIGNMENT_MASK_X64; // make sure the address page aligned.
outputLinearAddressAligned = outputLinearAddress & ADDRESS_PAGE_ALIGNMENT_MASK_X64; // make sure the address page aligned.
}
else
{
inputLinearAddressAligned = inputLinearAddress & ADDRESS_PAGE_ALIGNMENT_MASK_X86; // make sure the address page aligned.
outputLinearAddressAligned = outputLinearAddress & ADDRESS_PAGE_ALIGNMENT_MASK_X86; // make sure the address page aligned.
}
srcLeftShiftOffset = (int32_t)(inputLinearAddress - inputLinearAddressAligned) ;
dstLeftShiftOffset = (int32_t)(outputLinearAddress - outputLinearAddressAligned) ;
if(((size + srcLeftShiftOffset) > CM_MAX_1D_SURF_WIDTH)||
((size + dstLeftShiftOffset) > CM_MAX_1D_SURF_WIDTH))
{
CM_ASSERTMESSAGE("Error: Invalid copy size.");
return CM_GPUCOPY_INVALID_SIZE;
}
threadWidth = 0;
threadHeight = 0;
threadNum = size / BYTE_COPY_ONE_THREAD; // each thread copies 1024 x 4 bytes = 4K
if( threadNum == 0)
{
//If the data size is less than the amount copied per thread (4K), use the CPU instead of the GPU.
CmFastMemCopy((void *)(outputLinearAddress),
(void *)(inputLinearAddress),
size); //SSE copy used in CMRT.
event = nullptr;
return CM_SUCCESS;
}
//Calculate proper thread space's width and height
threadWidth = 1;
threadHeight = threadNum/threadWidth;
while((threadHeight > CM_MAX_THREADSPACE_HEIGHT_FOR_MW))
{
if(threadWidth > CM_MAX_THREADSPACE_WIDTH_FOR_MW)
{
hr = CM_GPUCOPY_INVALID_SIZE; // thread count exceeds 511*511
goto finish;
}
else if (threadWidth == 1)
{
threadWidth = THREAD_SPACE_WIDTH_INCREMENT; // first adjustment: start from 8
threadHeight = threadNum/threadWidth;
}
else
{
threadWidth += THREAD_SPACE_WIDTH_INCREMENT; // increase by 8 per iteration
threadHeight = threadNum/threadWidth;
}
}
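// Worked example (illustrative): a 16 MB copy yields threadNum = 16 MB / 4 KB
// = 4096 threads; the loop tries threadWidth = 1 (threadHeight 4096), then 8
// (threadHeight 512), then 16 (threadHeight 256), which is the first shape
// that fits the media-walker height limit (assumed to be 511 here).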
CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateBufferUP(size + srcLeftShiftOffset, (void *)inputLinearAddressAligned,surfaceInput));
CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateBufferUP(size + dstLeftShiftOffset, (void *)outputLinearAddressAligned,surfaceOutput));
CM_CHK_CMSTATUS_GOTOFINISH(CreateGPUCopyKernel(size, 0, CM_SURFACE_FORMAT_INVALID, CM_FASTCOPY_CPU2CPU, gpuCopyKernelParam));
CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyKernelParam);
CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyKernelParam->kernel);
kernel = gpuCopyKernelParam->kernel;
CM_CHK_NULL_GOTOFINISH_CMERROR(surfaceInput);
CM_CHK_CMSTATUS_GOTOFINISH(surfaceInput->GetIndex(surfaceInputIndex));
CM_CHK_NULL_GOTOFINISH_CMERROR(surfaceOutput);
CM_CHK_CMSTATUS_GOTOFINISH(surfaceOutput->GetIndex(surfaceOutputIndex));
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetThreadCount(threadWidth * threadHeight));
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 0, sizeof( SurfaceIndex ), surfaceInputIndex ));
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 1, sizeof( SurfaceIndex ), surfaceOutputIndex ));
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 2, sizeof( int ), &threadWidth ));
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 3, sizeof( int ), &threadHeight ));
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 4, sizeof( int ), &srcLeftShiftOffset ));
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 5, sizeof( int ), &dstLeftShiftOffset ));
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 6, sizeof( int ), &size ));
CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateThreadSpace(threadWidth, threadHeight, threadSpace));
CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateTask(task));
CM_CHK_NULL_GOTOFINISH_CMERROR(task);
CM_CHK_CMSTATUS_GOTOFINISH(task->AddKernel (kernel));
if (option & CM_FASTCOPY_OPTION_DISABLE_TURBO_BOOST)
{
// disable turbo
CM_TASK_CONFIG taskConfig;
CmSafeMemSet(&taskConfig, 0, sizeof(CM_TASK_CONFIG));
taskConfig.turboBoostFlag = CM_TURBO_BOOST_DISABLE;
task->SetProperty(taskConfig);
}
CM_CHK_CMSTATUS_GOTOFINISH(EnqueueFast(task, event, threadSpace));
if ((option & CM_FASTCOPY_OPTION_BLOCKING) && (event))
{
CM_CHK_CMSTATUS_GOTOFINISH(event->WaitForTaskFinished());
}
//Copy the unaligned part by using CPU
gpuMemcopySize = threadHeight * threadWidth *BYTE_COPY_ONE_THREAD;
cpuMemcopySize = size - threadHeight * threadWidth *BYTE_COPY_ONE_THREAD;
CmFastMemCopy((void *)(outputLinearAddress+gpuMemcopySize),
(void *)(inputLinearAddress+gpuMemcopySize),
cpuMemcopySize); //SSE copy used in CMRT.
CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyThreadSpace(threadSpace));
CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyTask(task));
CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyBufferUP(surfaceOutput)); // reference counting guarantees the task finishes before the BufferUP is actually destroyed.
CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyBufferUP(surfaceInput));
GPUCOPY_KERNEL_UNLOCK(gpuCopyKernelParam);
finish:
if(hr != CM_SUCCESS)
{ //Failed
if( surfaceInput == nullptr || surfaceOutput == nullptr)
{
hr = CM_GPUCOPY_OUT_OF_RESOURCE; // the user needs to know whether the failure was caused by running out of BufferUP resources.
}
else
{
hr = CM_FAILURE;
}
if(surfaceInput) m_device->DestroyBufferUP(surfaceInput);
if(surfaceOutput) m_device->DestroyBufferUP(surfaceOutput);
if(kernel && gpuCopyKernelParam) GPUCOPY_KERNEL_UNLOCK(gpuCopyKernelParam);
if(threadSpace) m_device->DestroyThreadSpace(threadSpace);
if(task) m_device->DestroyTask(task);
}
return hr;
}
//*----------------------------------------------------------------------------------------
//| Purpose: Pop task from flushed Queue, Update surface state and Destroy the task
//| Notes:
//*----------------------------------------------------------------------------------------
void CmQueueRT::PopTaskFromFlushedQueue()
{
CmTaskInternal* topTask = (CmTaskInternal*)m_flushedTasks.Pop();
if ( topTask != nullptr )
{
CmEventRT *event = nullptr;
topTask->GetTaskEvent( event );
if ( event != nullptr )
{
LARGE_INTEGER nTime;
if ( !(MOS_QueryPerformanceCounter( (uint64_t*)&nTime.QuadPart )) )
{
CM_ASSERTMESSAGE("Error: Query performace counter failure.");
}
else
{
event->SetCompleteTime( nTime );
}
}
#if MDF_SURFACE_CONTENT_DUMP
PCM_CONTEXT_DATA cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
if (cmData->cmHalState->dumpSurfaceContent)
{
int32_t taskId = 0;
if (event != nullptr)
{
event->GetTaskDriverId(taskId);
}
topTask->SurfaceDump(taskId);
}
#endif
CmTaskInternal::Destroy( topTask );
}
return;
}
int32_t CmQueueRT::TouchFlushedTasks( )
{
int32_t hr = CM_SUCCESS;
if (m_flushedTasks.IsEmpty())
{
if (!m_enqueuedTasks.IsEmpty())
{
// if FlushedQueue is empty and EnqueuedQueue is not empty
// try flush task to FlushedQueue
hr = FlushTaskWithoutSync();
if (FAILED(hr))
{
return hr;
}
}
else
{ // no task in flushedQueue and EnqueuedQueue
return CM_FAILURE;
}
}
// Flush FlushedQueue
hr = QueryFlushedTasks();
return hr;
}
//*-----------------------------------------------------------------------------
//! Query the flushed-task queue: pop and destroy the tasks that have finished
//! (or hit a media reset), stopping at the first task that is still running.
//! This is a non-blocking call, i.e. it returns immediately without waiting for
//! the GPU to finish the execution of the remaining tasks.
//! INPUT:
//! OUTPUT:
//! CM_SUCCESS if the query completes;
//! CM_FAILURE otherwise.
//! More error codes may be added.
//!
//*-----------------------------------------------------------------------------
int32_t CmQueueRT::QueryFlushedTasks()
{
int32_t hr = CM_SUCCESS;
m_criticalSectionFlushedTask.Acquire();
while( !m_flushedTasks.IsEmpty() )
{
CmTaskInternal* task = (CmTaskInternal*)m_flushedTasks.Top();
CM_CHK_NULL_GOTOFINISH_CMERROR(task);
CM_STATUS status = CM_STATUS_FLUSHED ;
task->GetTaskStatus(status);
if( status == CM_STATUS_FINISHED )
{
PopTaskFromFlushedQueue();
}
else
{
// media reset
if (status == CM_STATUS_RESET)
{
PCM_CONTEXT_DATA cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
// Clear task status table in Cm Hal State
int32_t taskId;
CmEventRT*pTopTaskEvent;
task->GetTaskEvent(pTopTaskEvent);
CM_CHK_NULL_GOTOFINISH_CMERROR(pTopTaskEvent);
pTopTaskEvent->GetTaskDriverId(taskId);
cmData->cmHalState->taskStatusTable[taskId] = CM_INVALID_INDEX;
//Pop task and Destroy it
PopTaskFromFlushedQueue();
}
// It is an in-order queue; if this one hasn't finished,
// the following ones haven't finished either.
break;
}
}
finish:
m_criticalSectionFlushedTask.Release();
return hr;
}
//*-----------------------------------------------------------------------------
//! Destroy an event created from this queue.
//! The application's event pointer is always reset to nullptr, even when the
//! underlying event object is kept alive because its reference count has not
//! reached zero yet.
//! INPUT: event -- reference to the event pointer to be destroyed
//! OUTPUT:
//! CM_SUCCESS if the event is destroyed (or handed to DestroyEventFast);
//! CM_FAILURE if the event pointer is nullptr.
//*-----------------------------------------------------------------------------
CM_RT_API int32_t CmQueueRT::DestroyEvent( CmEvent* & event )
{
CLock Lock(m_criticalSectionEvent);
if (event == nullptr)
{
return CM_FAILURE;
}
uint32_t index = 0;
CmEventRT *eventRT = dynamic_cast<CmEventRT *>(event);
if (eventRT == nullptr)
{
return DestroyEventFast(event);
}
eventRT->GetIndex(index);
CM_ASSERT( m_eventArray.GetElement( index ) == eventRT );
int32_t status = CmEventRT::Destroy( eventRT );
if( status == CM_SUCCESS && eventRT == nullptr)
{
m_eventArray.SetElement(index, nullptr);
}
// Return nullptr to the application even if the event is not destroyed
// because its reference count is not yet zero
event = nullptr;
return status;
}
//*-----------------------------------------------------------------------------
//| Purpose: Clean the Queue if its tasks time out
//| Returns: Result of the operation.
//*-----------------------------------------------------------------------------
int32_t CmQueueRT::CleanQueue( )
{
int32_t status = CM_SUCCESS;
// Although CleanQueue is only called from ~CmDevice, this flush is still
// necessary because it calls FlushTaskWithoutSync in blocking mode.
if( !m_enqueuedTasks.IsEmpty() )
{
// If there are tasks not yet flushed (i.e. not sent to the driver),
// wait until all such tasks are flushed
FlushTaskWithoutSync( true );
}
CM_ASSERT( m_enqueuedTasks.IsEmpty() );
//Used for timeout detection
LARGE_INTEGER freq;
MOS_QueryPerformanceFrequency((uint64_t*)&freq.QuadPart);
LARGE_INTEGER start;
MOS_QueryPerformanceCounter((uint64_t*)&start.QuadPart);
int64_t timeout = start.QuadPart + (CM_MAX_TIMEOUT * freq.QuadPart * m_flushedTasks.GetCount()); //Count to timeout at
while( !m_flushedTasks.IsEmpty() && status != CM_EXCEED_MAX_TIMEOUT )
{
QueryFlushedTasks();
LARGE_INTEGER current;
MOS_QueryPerformanceCounter((uint64_t*)&current.QuadPart);
if( current.QuadPart > timeout )
status = CM_EXCEED_MAX_TIMEOUT;
}
return status;
}
CM_QUEUE_CREATE_OPTION &CmQueueRT::GetQueueOption()
{
return m_queueOption;
}
//*-----------------------------------------------------------------------------
//| Purpose: Get the count of task in queue
//| Returns: Result of the operation.
//*-----------------------------------------------------------------------------
int32_t CmQueueRT::GetTaskCount( uint32_t& numTasks )
{
numTasks = m_enqueuedTasks.GetCount() + m_flushedTasks.GetCount();
return CM_SUCCESS;
}
//*-----------------------------------------------------------------------------
//| Purpose: Use GPU to init Surface2D
//| Returns: result of operation
//*-----------------------------------------------------------------------------
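//
// Illustrative usage only (a sketch; "queue" and "surf" are hypothetical
// application-side objects):
//
//     CmEvent *initEvent = nullptr;
//     int32_t res = queue->EnqueueInitSurface2D(surf, 0x80808080, initEvent);
//     // The 32-bit initValue is passed to the predefined set kernel; wait on
//     // initEvent before reading the surface back.
//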
CM_RT_API int32_t CmQueueRT::EnqueueInitSurface2D( CmSurface2D* surf2D, const uint32_t initValue, CmEvent* &event)
{
INSERT_API_CALL_LOG();
if (!m_device->HasGpuInitKernel())
{
return CM_NOT_IMPLEMENTED;
}
int32_t hr = CM_SUCCESS;
uint32_t width = 0;
uint32_t height = 0;
uint32_t sizePerPixel = 0;
CmProgram *gpuInitKernelProgram = nullptr;
CmKernel *kernel = nullptr;
SurfaceIndex *outputIndexCM = nullptr;
CmThreadSpace *threadSpace = nullptr;
CmTask *gpuCopyTask = nullptr;
uint32_t threadWidth = 0;
uint32_t threadHeight = 0;
uint32_t threadNum = 0;
CmSurfaceManager* surfaceMgr = nullptr;
CM_SURFACE_FORMAT format = CM_SURFACE_FORMAT_INVALID;
if(!surf2D)
{
CM_ASSERTMESSAGE("Error: Pointer to surface 2d is null.");
return CM_FAILURE;
}
CmSurface2DRT *surf2DRT = static_cast<CmSurface2DRT *>(surf2D);
CM_CHK_CMSTATUS_GOTOFINISH(m_device->LoadPredefinedInitKernel(gpuInitKernelProgram));
CM_CHK_CMSTATUS_GOTOFINISH(surf2DRT->GetSurfaceDesc(width, height, format,sizePerPixel));
m_device->GetSurfaceManager(surfaceMgr);
CM_CHK_NULL_GOTOFINISH_CMERROR(surfaceMgr);
if (format == CM_SURFACE_FORMAT_NV12 || format == CM_SURFACE_FORMAT_P010 || format == CM_SURFACE_FORMAT_P016)
{
CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel( gpuInitKernelProgram, _NAME( surfaceCopy_set_NV12 ), kernel, "PredefinedGPUCopyKernel"));
}
else
{
CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel( gpuInitKernelProgram, _NAME( surfaceCopy_set ), kernel, "PredefinedGPUCopyKernel" ));
}
CM_CHK_NULL_GOTOFINISH_CMERROR(kernel);
CM_CHK_CMSTATUS_GOTOFINISH(surf2D->GetIndex( outputIndexCM ));
threadWidth = ( uint32_t )ceil( ( double )width*sizePerPixel/BLOCK_PIXEL_WIDTH/4 );
threadHeight = ( uint32_t )ceil( ( double )height/BLOCK_HEIGHT );
threadNum = threadWidth * threadHeight;
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetThreadCount( threadNum ));
CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateThreadSpace( threadWidth, threadHeight, threadSpace ));
CM_CHK_NULL_GOTOFINISH_CMERROR(threadSpace);
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 0, sizeof( uint32_t ), &initValue ));
CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 1, sizeof( SurfaceIndex ), outputIndexCM ));
CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateTask(gpuCopyTask));
CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyTask);
CM_CHK_CMSTATUS_GOTOFINISH(gpuCopyTask->AddKernel( kernel ));
CM_CHK_CMSTATUS_GOTOFINISH(EnqueueFast(gpuCopyTask, event, threadSpace));
finish:
if (kernel) m_device->DestroyKernel( kernel );
if (gpuCopyTask) m_device->DestroyTask(gpuCopyTask);
if (threadSpace) m_device->DestroyThreadSpace(threadSpace);
return hr;
}
//*-----------------------------------------------------------------------------
//! Flush a general task to the HAL CM layer for execution.
//! This is a non-blocking call, i.e. it returns immediately without waiting for
//! GPU to finish the execution of tasks.
//! INPUT: task -- Pointer to CmTaskInternal object
//! OUTPUT:
//! CM_SUCCESS if the task is submitted
//! CM_FAILURE otherwise.
//*-----------------------------------------------------------------------------
int32_t CmQueueRT::FlushGeneralTask(CmTaskInternal* task)
{
CM_RETURN_CODE hr = CM_SUCCESS;
CM_HAL_EXEC_TASK_PARAM param;
PCM_HAL_KERNEL_PARAM kernelParam = nullptr;
CmKernelData* kernelData = nullptr;
uint32_t kernelDataSize = 0;
PCM_CONTEXT_DATA cmData = nullptr;
CmEventRT* event = nullptr;
uint32_t totalThreadCount= 0;
uint32_t count = 0;
PCM_HAL_KERNEL_PARAM tempData = nullptr;
uint32_t maxTSWidth = 0;
bool hasThreadArg = false;
CmSafeMemSet( &param, 0, sizeof( CM_HAL_EXEC_TASK_PARAM ) );
//GT-PIN
if(m_device->CheckGTPinEnabled())
{
CM_CHK_CMSTATUS_GOTOFINISH(task->GetKernelSurfInfo(param.surfEntryInfoArrays));
}
task->GetKernelCount( count );
param.numKernels = count;
param.kernels = MOS_NewArray(PCM_HAL_KERNEL_PARAM,count);
param.kernelSizes = MOS_NewArray(uint32_t,count);
param.kernelCurbeOffset = MOS_NewArray(uint32_t,count);
param.queueOption = m_queueOption;
CM_CHK_NULL_GOTOFINISH(param.kernels, CM_OUT_OF_HOST_MEMORY);
CM_CHK_NULL_GOTOFINISH(param.kernelSizes, CM_OUT_OF_HOST_MEMORY);
CM_CHK_NULL_GOTOFINISH(param.kernelCurbeOffset, CM_OUT_OF_HOST_MEMORY);
for( uint32_t i = 0; i < count; i ++ )
{
task->GetKernelData( i, kernelData );
CM_CHK_NULL_GOTOFINISH_CMERROR(kernelData);
kernelParam = kernelData->GetHalCmKernelData();
CM_CHK_NULL_GOTOFINISH_CMERROR(kernelParam);
hasThreadArg |= kernelParam->perThreadArgExisted;
task->GetKernelDataSize( i, kernelDataSize );
if(kernelDataSize == 0)
{
CM_ASSERTMESSAGE("Error: Invalid kernel data size.");
hr = CM_FAILURE;
goto finish;
}
tempData = kernelData->GetHalCmKernelData();
param.kernels[ i ] = tempData;
param.kernelSizes[ i ] = kernelDataSize;
param.kernelCurbeOffset[ i ] = task->GetKernelCurbeOffset(i);
param.globalSurfaceUsed |= tempData->globalSurfaceUsed;
param.kernelDebugEnabled |= tempData->kernelDebugEnabled;
}
/*
* Preset the default TS width/height/dependency:
* TS width = MOS_MIN(CM_MAX_THREADSPACE_WIDTH, threadcount)
* TS height = totalThreadCount/CM_MAX_THREADSPACE_WIDTH + 1
* dependency = CM_NONE_DEPENDENCY
* If threadSpace is nullptr, the default TS width/height/dependency above are passed to the driver.
* If threadSpace is valid, the TS width/height/dependency are updated below according to the thread space set by the user.
*/
task->GetTotalThreadCount(totalThreadCount);
if (hasThreadArg)
{
maxTSWidth = CM_MAX_THREADSPACE_WIDTH_FOR_MW + 1; // 512 allowed for media object
}
else
{
maxTSWidth = CM_MAX_THREADSPACE_WIDTH_FOR_MW; // 511 for media walker
}
param.threadSpaceWidth = (totalThreadCount > maxTSWidth) ? maxTSWidth : totalThreadCount;
if(totalThreadCount%maxTSWidth)
{
param.threadSpaceHeight = totalThreadCount/maxTSWidth + 1;
}
else
{
param.threadSpaceHeight = totalThreadCount/maxTSWidth;
}
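// Worked example (illustrative): with hasThreadArg == false and
// totalThreadCount = 1000, threadSpaceWidth is clamped to maxTSWidth and,
// since 1000 is not an exact multiple of it, threadSpaceHeight becomes
// totalThreadCount / maxTSWidth + 1 (i.e. 2 rows if the media-walker width
// limit is the usual 511).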
param.dependencyPattern = CM_NONE_DEPENDENCY;
if (task->IsThreadSpaceCreated()) //scoreboard data preparation
{
if(task->IsThreadCoordinatesExisted())
{
param.threadCoordinates = MOS_NewArray(PCM_HAL_SCOREBOARD, count);
param.dependencyMasks = MOS_NewArray(PCM_HAL_MASK_AND_RESET, count);
CM_CHK_NULL_GOTOFINISH(param.threadCoordinates, CM_OUT_OF_HOST_MEMORY);
CM_CHK_NULL_GOTOFINISH(param.dependencyMasks, CM_OUT_OF_HOST_MEMORY);
for(uint32_t i=0; i<count; i++)
{
void *kernelCoordinates = nullptr;
void *dependencyMasks = nullptr;
task->GetKernelCoordinates(i, kernelCoordinates);
task->GetKernelDependencyMasks(i, dependencyMasks);
param.threadCoordinates[i] = (PCM_HAL_SCOREBOARD)kernelCoordinates;
param.dependencyMasks[i] = (PCM_HAL_MASK_AND_RESET)dependencyMasks;
}
}
else
{
param.threadCoordinates = nullptr;
}
task->GetDependencyPattern(param.dependencyPattern);
task->GetThreadSpaceSize(param.threadSpaceWidth, param.threadSpaceHeight);
task->GetWalkingPattern(param.walkingPattern);
if( task->CheckWalkingParametersSet( ) )
{
param.walkingParamsValid = 1;
CM_CHK_CMSTATUS_GOTOFINISH(task->GetWalkingParameters(param.walkingParams));
}
else
{
param.walkingParamsValid = 0;
}
if( task->CheckDependencyVectorsSet( ) )
{
param.dependencyVectorsValid = 1;
CM_CHK_CMSTATUS_GOTOFINISH(task->GetDependencyVectors(param.dependencyVectors));
}
else
{
param.dependencyVectorsValid = 0;
}
}
if (param.threadSpaceWidth == 0)
{
CM_ASSERTMESSAGE("Error: Invalid thread space.");
hr = CM_INVALID_THREAD_SPACE;
goto finish;
}
task->GetColorCountMinusOne(param.colorCountMinusOne);
task->GetMediaWalkerGroupSelect(param.mediaWalkerGroupSelect);
param.syncBitmap = task->GetSyncBitmap();
param.conditionalEndBitmap = task->GetConditionalEndBitmap();
param.userDefinedMediaState = task->GetMediaStatePtr();
CmSafeMemCopy(param.conditionalEndInfo, task->GetConditionalEndInfo(), sizeof(param.conditionalEndInfo));
CmSafeMemCopy(&param.taskConfig, task->GetTaskConfig(), sizeof(param.taskConfig));
cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(cmData->cmHalState->pfnSetPowerOption(cmData->cmHalState, task->GetPowerOption()));
cmData->cmHalState->osInterface->pfnSetGpuContext(cmData->cmHalState->osInterface, (MOS_GPU_CONTEXT)m_queueOption.GPUContext);
RegisterSyncEvent();
CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(cmData->cmHalState->pfnExecuteTask(cmData->cmHalState, &param));
if( param.taskIdOut < 0 )
{
CM_ASSERTMESSAGE("Error: Invalid task ID.");
hr = CM_FAILURE;
goto finish;
}
TASK_LOG(task);
task->GetTaskEvent( event );
CM_CHK_NULL_GOTOFINISH_CMERROR(event);
CM_CHK_CMSTATUS_GOTOFINISH(event->SetTaskDriverId( param.taskIdOut ));
CM_CHK_CMSTATUS_GOTOFINISH(event->SetTaskOsData( param.osData ));
CM_CHK_CMSTATUS_GOTOFINISH(task->ResetKernelDataStatus());
//GT-PIN
if(m_device->CheckGTPinEnabled())
{
//No need to clear the surfEntryInfoArrays here. They will be destroyed by CmTaskInternal.
CM_CHK_CMSTATUS_GOTOFINISH(event->SetSurfaceDetails(param.surfEntryInfoArrays));
}
finish:
MosSafeDeleteArray( param.kernels );
MosSafeDeleteArray( param.kernelSizes );
MosSafeDeleteArray( param.threadCoordinates);
MosSafeDeleteArray( param.dependencyMasks);
MosSafeDeleteArray( param.kernelCurbeOffset);
return hr;
}
//*-----------------------------------------------------------------------------
//! Flush a thread group based task to HAL CM layer for execution.
//! This is a non-blocking call, i.e. it returns immediately without waiting for
//! GPU to finish the execution of tasks.
//! INPUT: task -- Pointer to CmTaskInternal object
//! OUTPUT:
//! CM_SUCCESS if the task is submitted
//! CM_FAILURE otherwise.
//*-----------------------------------------------------------------------------
int32_t CmQueueRT::FlushGroupTask(CmTaskInternal* task)
{
CM_RETURN_CODE hr = CM_SUCCESS;
CM_HAL_EXEC_TASK_GROUP_PARAM param;
CmKernelData* kernelData = nullptr;
uint32_t kernelDataSize = 0;
uint32_t count = 0;
PCM_CONTEXT_DATA cmData = nullptr;
CmEventRT * event = nullptr;
PCM_HAL_KERNEL_PARAM tempData = nullptr;
CmSafeMemSet( &param, 0, sizeof( CM_HAL_EXEC_TASK_GROUP_PARAM ) );
//GT-PIN
if(this->m_device->CheckGTPinEnabled())
{
CM_CHK_CMSTATUS_GOTOFINISH(task->GetKernelSurfInfo(param.surEntryInfoArrays));
}
task->GetKernelCount( count );
param.numKernels = count;
param.kernels = MOS_NewArray(PCM_HAL_KERNEL_PARAM, count);
param.kernelSizes = MOS_NewArray(uint32_t, count);
param.kernelCurbeOffset = MOS_NewArray(uint32_t, count);
param.queueOption = m_queueOption;
param.mosVeHintParams = (m_usingVirtualEngine)? &m_mosVeHintParams: nullptr;
CmSafeMemCopy(&param.taskConfig, task->GetTaskConfig(), sizeof(param.taskConfig));
CM_CHK_NULL_GOTOFINISH_CMERROR(param.kernels);
CM_CHK_NULL_GOTOFINISH_CMERROR(param.kernelSizes);
CM_CHK_NULL_GOTOFINISH_CMERROR(param.kernelCurbeOffset);
for( uint32_t i = 0; i < count; i ++ )
{
task->GetKernelData( i, kernelData );
CM_CHK_NULL_GOTOFINISH_CMERROR(kernelData);
task->GetKernelDataSize( i, kernelDataSize );
if( kernelDataSize == 0)
{
CM_ASSERTMESSAGE("Error: Invalid kernel data size.");
hr = CM_FAILURE;
goto finish;
}
tempData = kernelData->GetHalCmKernelData( );
param.kernels[ i ] = tempData;
param.kernelSizes[ i ] = kernelDataSize;
param.kernelCurbeOffset [ i ] = task->GetKernelCurbeOffset(i);
param.globalSurfaceUsed |= tempData->globalSurfaceUsed;
param.kernelDebugEnabled |= tempData->kernelDebugEnabled;
}
task->GetSLMSize(param.slmSize);
if(param.slmSize > MAX_SLM_SIZE_PER_GROUP_IN_1K)
{
CM_ASSERTMESSAGE("Error: SLM size exceeds the maximum per group.");
hr = CM_EXCEED_MAX_SLM_SIZE;
goto finish;
}
if (task->IsThreadGroupSpaceCreated())//thread group size
{
task->GetThreadGroupSpaceSize(param.threadSpaceWidth, param.threadSpaceHeight,
param.threadSpaceDepth, param.groupSpaceWidth,
param.groupSpaceHeight, param.groupSpaceDepth);
}
param.syncBitmap = task->GetSyncBitmap();
param.conditionalEndBitmap = task->GetConditionalEndBitmap();
param.userDefinedMediaState = task->GetMediaStatePtr();
CmSafeMemCopy(param.conditionalEndInfo, task->GetConditionalEndInfo(), sizeof(param.conditionalEndInfo));
CmSafeMemCopy(param.krnExecCfg, task->GetKernelExecuteConfig(), sizeof(param.krnExecCfg));
// Call HAL layer to execute pfnExecuteGroupTask
cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR( cmData->cmHalState->pfnSetPowerOption( cmData->cmHalState, task->GetPowerOption() ) );
CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(
ExecuteGroupTask(
cmData->cmHalState,
&param,
static_cast<MOS_GPU_CONTEXT>(m_queueOption.GPUContext)));
if( param.taskIdOut < 0 )
{
CM_ASSERTMESSAGE("Error: Invalid task ID.");
hr = CM_FAILURE;
goto finish;
}
TASK_LOG(task);
task->GetTaskEvent( event );
CM_CHK_NULL_GOTOFINISH_CMERROR( event );
CM_CHK_CMSTATUS_GOTOFINISH(event->SetTaskDriverId( param.taskIdOut ));
CM_CHK_CMSTATUS_GOTOFINISH(event->SetTaskOsData( param.osData ));
CM_CHK_CMSTATUS_GOTOFINISH(task->ResetKernelDataStatus());
//GT-PIN
if(this->m_device->CheckGTPinEnabled())
{
CM_CHK_CMSTATUS_GOTOFINISH(event->SetSurfaceDetails(param.surEntryInfoArrays));
}
finish:
MosSafeDeleteArray( param.kernels );
MosSafeDeleteArray( param.kernelSizes );
MosSafeDeleteArray( param.kernelCurbeOffset);
return hr;
}
//*-----------------------------------------------------------------------------
//! Flush a VEBOX task to HAL CM layer for execution.
//! This is a non-blocking call, i.e. it returns immediately without waiting for
//! GPU to finish the execution of tasks.
//! INPUT: task -- Pointer to CmTaskInternal object
//! OUTPUT:
//! CM_SUCCESS if the task is submitted
//! CM_FAILURE otherwise.
//*-----------------------------------------------------------------------------
int32_t CmQueueRT::FlushVeboxTask(CmTaskInternal* task)
{
CM_RETURN_CODE hr = CM_SUCCESS;
CM_HAL_EXEC_VEBOX_TASK_PARAM param;
PCM_CONTEXT_DATA cmData = nullptr;
CmEventRT * event = nullptr;
uint8_t *stateData = nullptr;
uint8_t *surfaceData = nullptr;
CmBuffer_RT * temp = nullptr;
CmSafeMemSet( &param, 0, sizeof( CM_HAL_EXEC_VEBOX_TASK_PARAM ) );
//Set VEBOX state data pointer and size
//Set VEBOX surface data pointer and size
CM_VEBOX_STATE cmVeboxState;
CmBufferUP *veboxParamBuf = nullptr;
CM_VEBOX_SURFACE_DATA cmVeboxSurfaceData;
task->GetVeboxState(cmVeboxState);
task->GetVeboxParam(veboxParamBuf);
task->GetVeboxSurfaceData(cmVeboxSurfaceData);
CM_CHK_NULL_GOTOFINISH_CMERROR(veboxParamBuf);
temp = static_cast<CmBuffer_RT*>(veboxParamBuf);
temp->GetHandle(param.veboxParamIndex);
param.cmVeboxState = cmVeboxState;
param.veboxParam = veboxParamBuf;
param.veboxSurfaceData = cmVeboxSurfaceData;
param.queueOption = m_queueOption;
//Set VEBOX task id to -1
param.taskIdOut = -1;
cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
cmData->cmHalState->osInterface->pfnSetGpuContext(cmData->cmHalState->osInterface, (MOS_GPU_CONTEXT)m_queueOption.GPUContext);
RegisterSyncEvent();
CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR( cmData->cmHalState->pfnExecuteVeboxTask( cmData->cmHalState, &param ) );
if( param.taskIdOut < 0 )
{
CM_ASSERTMESSAGE("Error: Invalid task ID.");
hr = CM_FAILURE;
goto finish;
}
task->GetTaskEvent( event );
CM_CHK_NULL_GOTOFINISH_CMERROR( event );
CM_CHK_CMSTATUS_GOTOFINISH(event->SetTaskDriverId( param.taskIdOut ));
CM_CHK_CMSTATUS_GOTOFINISH(event->SetTaskOsData( param.osData ));
finish:
return hr;
}
//*-----------------------------------------------------------------------------
//! Flush an enqueue-with-hints task to the HAL CM layer for execution.
//! This is a non-blocking call. i.e. it returns immediately without waiting for
//! GPU to finish the execution of tasks.
//! INPUT:
//! OUTPUT:
//! CM_SUCCESS if the task is submitted
//! CM_FAILURE otherwise.
//*-----------------------------------------------------------------------------
int32_t CmQueueRT::FlushEnqueueWithHintsTask( CmTaskInternal* task )
{
CM_RETURN_CODE hr = CM_SUCCESS;
CM_HAL_EXEC_HINTS_TASK_PARAM param;
PCM_CONTEXT_DATA cmData = nullptr;
CmKernelData* kernelData = nullptr;
uint32_t kernelDataSize = 0;
uint32_t count = 0;
CmEventRT *event = nullptr;
PCM_HAL_KERNEL_PARAM tempData = nullptr;
CmSafeMemSet( &param, 0, sizeof( CM_HAL_EXEC_HINTS_TASK_PARAM ) );
task->GetKernelCount ( count );
param.numKernels = count;
param.kernels = MOS_NewArray(PCM_HAL_KERNEL_PARAM, count);
param.kernelSizes = MOS_NewArray(uint32_t, count);
param.kernelCurbeOffset = MOS_NewArray(uint32_t, count);
param.queueOption = m_queueOption;
CM_CHK_NULL_GOTOFINISH_CMERROR(param.kernels);
CM_CHK_NULL_GOTOFINISH_CMERROR(param.kernelSizes);
CM_CHK_NULL_GOTOFINISH_CMERROR(param.kernelCurbeOffset);
task->GetHints(param.hints);
task->GetNumTasksGenerated(param.numTasksGenerated);
task->GetLastTask(param.isLastTask);
for( uint32_t i = 0; i < count; i ++ )
{
task->GetKernelData( i, kernelData );
CM_CHK_NULL_GOTOFINISH_CMERROR( kernelData );
task->GetKernelDataSize( i, kernelDataSize );
if( kernelDataSize == 0 )
{
CM_ASSERTMESSAGE("Error: Invalid kernel data size.");
hr = CM_FAILURE;
goto finish;
}
tempData = kernelData->GetHalCmKernelData();
param.kernels[ i ] = tempData;
param.kernelSizes[ i ] = kernelDataSize;
param.kernelCurbeOffset[ i ] = task->GetKernelCurbeOffset(i);
}
param.userDefinedMediaState = task->GetMediaStatePtr();
cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
CM_CHK_NULL_GOTOFINISH_CMERROR(cmData);
CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(cmData->cmHalState->pfnSetPowerOption(cmData->cmHalState, task->GetPowerOption()));
cmData->cmHalState->osInterface->pfnSetGpuContext(cmData->cmHalState->osInterface, (MOS_GPU_CONTEXT)m_queueOption.GPUContext);
RegisterSyncEvent();
CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(cmData->cmHalState->pfnExecuteHintsTask(cmData->cmHalState, &param));
if( param.taskIdOut < 0 )
{
CM_ASSERTMESSAGE("Error: Invalid task ID.");
hr = CM_FAILURE;
goto finish;
}
TASK_LOG(task);
task->GetTaskEvent( event );
CM_CHK_NULL_GOTOFINISH_CMERROR( event );
CM_CHK_CMSTATUS_GOTOFINISH(event->SetTaskDriverId( param.taskIdOut ));
CM_CHK_CMSTATUS_GOTOFINISH(event->SetTaskOsData( param.osData ));
CM_CHK_CMSTATUS_GOTOFINISH(task->ResetKernelDataStatus());
finish:
MosSafeDeleteArray( param.kernels );
MosSafeDeleteArray( param.kernelSizes );
MosSafeDeleteArray( param.kernelCurbeOffset );
return hr;
}
//*-----------------------------------------------------------------------------
//! Flush the queue, i.e. submit all tasks in the queue to execute according
//! to their order in the queue. The queue will be empty after the flush.
//! This is a non-blocking call, i.e. it returns immediately without waiting for
//! GPU to finish the execution of tasks.
//! INPUT:
//! OUTPUT:
//! CM_SUCCESS if all tasks in the queue are submitted
//! CM_FAILURE otherwise.
//*-----------------------------------------------------------------------------
int32_t CmQueueRT::FlushTaskWithoutSync( bool flushBlocked )
{
int32_t hr = CM_SUCCESS;
CmTaskInternal* task = nullptr;
uint32_t taskType = CM_TASK_TYPE_DEFAULT;
uint32_t freeSurfNum = 0;
CmSurfaceManager* surfaceMgr = nullptr;
CSync* surfaceLock = nullptr;
m_criticalSectionHalExecute.Acquire(); // Enter HalCm Execute Protection
while( !m_enqueuedTasks.IsEmpty() )
{
uint32_t flushedTaskCount = m_flushedTasks.GetCount();
if ( flushBlocked )
{
while( flushedTaskCount >= m_halMaxValues->maxTasks )
{
// If the task count in the flushed queue has reached the hardware restriction,
// query the status of the flushed task queue and remove any finished tasks from it.
QueryFlushedTasks();
flushedTaskCount = m_flushedTasks.GetCount();
}
}
else
{
if( flushedTaskCount >= m_halMaxValues->maxTasks )
{
// If the task count in the flushed queue has reached the hardware restriction,
// query the status of the flushed task queue and remove any finished tasks from it.
QueryFlushedTasks();
flushedTaskCount = m_flushedTasks.GetCount();
if( flushedTaskCount >= m_halMaxValues->maxTasks )
{
// If none of the flushed tasks has finished, we can't flush more tasks.
break;
}
}
}
task = (CmTaskInternal*)m_enqueuedTasks.Pop();
CM_CHK_NULL_GOTOFINISH_CMERROR( task );
CmNotifierGroup *notifiers = m_device->GetNotifiers();
if (notifiers != nullptr)
{
notifiers->NotifyTaskFlushed(m_device, task);
}
task->GetTaskType(taskType);
switch(taskType)
{
case CM_INTERNAL_TASK_WITH_THREADSPACE:
hr = FlushGeneralTask(task);
break;
case CM_INTERNAL_TASK_WITH_THREADGROUPSPACE:
hr = FlushGroupTask(task);
break;
case CM_INTERNAL_TASK_VEBOX:
hr = FlushVeboxTask(task);
break;
case CM_INTERNAL_TASK_ENQUEUEWITHHINTS:
hr = FlushEnqueueWithHintsTask(task);
break;
default: // by default, assume the task is considered as general task: CM_INTERNAL_TASK_WITH_THREADSPACE
hr = FlushGeneralTask(task);
break;
}
if(hr == CM_SUCCESS)
{
m_flushedTasks.Push( task );
task->VtuneSetFlushTime(); // Record Flush Time
}
else
{
// Failed to flush, destroy the task.
CmTaskInternal::Destroy( task );
}
} // loop for task
QueryFlushedTasks();
finish:
m_criticalSectionHalExecute.Release();//Leave HalCm Execute Protection
//Delayed destroy for resource
m_device->GetSurfaceManager(surfaceMgr);
if (!surfaceMgr)
{
CM_ASSERTMESSAGE("Error: Pointer to surface manager is null.");
return CM_NULL_POINTER;
}
surfaceLock = m_device->GetSurfaceCreationLock();
if (surfaceLock == nullptr)
{
CM_ASSERTMESSAGE("Error: Pointer to surface creation lock is null.");
return CM_NULL_POINTER;
}
surfaceLock->Acquire();
surfaceMgr->RefreshDelayDestroySurfaces(freeSurfNum);
surfaceLock->Release();
return hr;
}
//*-----------------------------------------------------------------------------
//| Purpose: Enqueue a Vebox Task
//| Arguments :
//| pVebox_G75 [in] Pointer to a CmVebox object
//| event [in] Reference to the pointer to Event
//|
//| Returns: Result of the operation.
//*-----------------------------------------------------------------------------
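//
// Illustrative usage only (a sketch; "queue" and "vebox" are hypothetical
// objects the application created via CmDevice):
//
//     CmEvent *veboxEvent = nullptr;
//     int32_t res = queue->EnqueueVebox(vebox, veboxEvent);
//     if (res == CM_SUCCESS && veboxEvent)
//     {
//         veboxEvent->WaitForTaskFinished();
//     }
//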
CM_RT_API int32_t CmQueueRT::EnqueueVebox(CmVebox * vebox, CmEvent* & event)
{
INSERT_API_CALL_LOG();
int32_t hr = CM_SUCCESS;
CmTaskInternal* task = nullptr;
int32_t taskDriverId = -1;
bool isEventVisible = (event == CM_NO_EVENT)? false:true;
CmEventRT *eventRT = static_cast<CmEventRT *>(event);
//Check if the input is valid
if ( vebox == nullptr )
{
CM_ASSERTMESSAGE("Error: Pointer to vebox is null.");
return CM_NULL_POINTER;
}
CmVeboxRT *veboxRT = static_cast<CmVeboxRT *>(vebox);
CM_CHK_CMSTATUS_GOTOFINISH(CmTaskInternal::Create(m_device, veboxRT, task ));
LARGE_INTEGER nEnqueueTime;
if ( !(MOS_QueryPerformanceCounter( (uint64_t*)&nEnqueueTime.QuadPart )) )
{
CM_ASSERTMESSAGE("Error: Query Performance counter failure.");
hr = CM_FAILURE;
goto finish;
}
CM_CHK_CMSTATUS_GOTOFINISH(CreateEvent(task, isEventVisible, taskDriverId, eventRT));
if ( eventRT != nullptr )
{
eventRT->SetEnqueueTime( nEnqueueTime );
}
event = eventRT;
if (!m_enqueuedTasks.Push(task))
{
CM_ASSERTMESSAGE("Error: Push enqueued tasks failure.")
hr = CM_FAILURE;
goto finish;
}
CM_CHK_CMSTATUS_GOTOFINISH(FlushTaskWithoutSync());
finish:
if (hr != CM_SUCCESS)
{
CmTaskInternal::Destroy(task);
}
return hr;
}
//*-----------------------------------------------------------------------------
//| Purpose: Create Event and Update event in m_eventArray
//| Returns: result of operation
//*-----------------------------------------------------------------------------
int32_t CmQueueRT::CreateEvent(CmTaskInternal *task, bool isVisible, int32_t &taskDriverId, CmEventRT *&event )
{
int32_t hr = CM_SUCCESS;
m_criticalSectionEvent.Acquire();
uint32_t freeSlotInEventArray = m_eventArray.GetFirstFreeIndex();
hr = CmEventRT::Create( freeSlotInEventArray, this, task, taskDriverId, m_device, isVisible, event );
if (hr == CM_SUCCESS)
{
m_eventArray.SetElement( freeSlotInEventArray, event );
m_eventCount ++;
task->SetTaskEvent( event );
if(!isVisible)
{
event = nullptr;
}
}
else
{
CM_ASSERTMESSAGE("Error: Create Event failure.")
}
m_criticalSectionEvent.Release();
return hr;
}
//*---------------------------------------------------------------------------------------------------------
//| Name: EnqueueCopyCPUToGPUFullStride()
//| Purpose: Copy data from system memory to video memory (surface)
//| Arguments:
//| surface [in] Pointer to a CmSurface2D object as copy destination
//| sysMem [in] Pointer to a system memory as copy source
//| widthStride [in] Width stride in bytes for system memory (to calculate start of next line)
//| heightStride [in] Height stride in rows for system memory (to calculate start of next plane)
//| option [in] Option passed from user, blocking copy, non-blocking copy or disable turbo boost
//| event [in,out] Reference to the pointer to Event
//| Returns: Result of the operation.
//|
//| Restrictions & Notes:
//| 1) sysMem must be 16-byte aligned.
//| 2) The surface's width should be 16-byte aligned for best performance.
//| 3) widthStride and heightStride are used to indicate the padding information in system memory
//| widthStride = width_in_pixel * bytes_per_pixel + padding_in_bytes
//| heightStride = height + padding_in_row
//*---------------------------------------------------------------------------------------------------------
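//
// Illustrative usage only (a sketch; "queue", "surf" and "hostMem" are
// hypothetical application-side names). For example, an NV12 surface 1920
// pixels wide whose lines are padded to a 2048-byte pitch and whose plane is
// padded to 1088 rows would use:
//
//     const uint32_t widthStride  = 2048;   // bytes per line incl. padding
//     const uint32_t heightStride = 1088;   // rows per plane incl. padding
//     CmEvent *copyEvent = nullptr;
//     queue->EnqueueCopyCPUToGPUFullStride(surf, hostMem, widthStride,
//                                          heightStride, 0, copyEvent);
//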
CM_RT_API int32_t CmQueueRT::EnqueueCopyCPUToGPUFullStride( CmSurface2D* surface,
const unsigned char* sysMem,
const uint32_t widthStride,
const uint32_t heightStride,
const uint32_t option,
CmEvent* & event )
{
INSERT_API_CALL_LOG();
if (!m_device->HasGpuCopyKernel())
{
return CM_NOT_IMPLEMENTED;
}
CmSurface2DRT *surfaceRT = static_cast<CmSurface2DRT *>(surface);
return EnqueueCopyInternal(surfaceRT, (unsigned char*)sysMem, widthStride, heightStride, CM_FASTCOPY_CPU2GPU, option, event);
}
//*---------------------------------------------------------------------------------------------------------
//| Name: EnqueueCopyGPUToCPUFullStride()
//| Purpose: Copy data from tiled video memory (surface) to linear system memory
//| Arguments:
//| surface [in] Pointer to a CmSurface2D object as copy source
//| sysMem [in] Pointer to a system memory as copy destination
//| widthStride [in] Width stride in bytes for system memory (to calculate start of next line)
//| heightStride [in] Height stride in rows for system memory (to calculate start of next plane)
//| option [in] Option passed from user, blocking copy,non-blocking copy or disable turbo boost
//| event [in,out] Reference to the pointer to Event
//| Returns: Result of the operation.
//|
//| Restrictions & Notes:
//| 1) sysMem must be 16-byte aligned.
//| 2) The surface's width should be 16-byte aligned for best performance.
//| 3) widthStride and heightStride are used to indicate the padding information in system memory
//| widthStride = width_in_pixel * bytes_per_pixel + padding_in_bytes
//| heightStride = height + padding_in_row
//*---------------------------------------------------------------------------------------------------------
CM_RT_API int32_t CmQueueRT::EnqueueCopyGPUToCPUFullStride( CmSurface2D* surface,
unsigned char* sysMem,
const uint32_t widthStride,
const uint32_t heightStride,
const uint32_t option,
CmEvent* & event )
{
INSERT_API_CALL_LOG();
if (!m_device->HasGpuCopyKernel())
{
return CM_NOT_IMPLEMENTED;
}
CmSurface2DRT *surfaceRT = static_cast<CmSurface2DRT *>(surface);
return EnqueueCopyInternal(surfaceRT, sysMem, widthStride, heightStride, CM_FASTCOPY_GPU2CPU, option, event);
}
//*---------------------------------------------------------------------------------------------------------
//| Name: CreateGPUCopyKernel()
//| Purpose: Create a GPUCopy kernel; reuse an existing kernel if one has already been created and is reusable
//| Arguments:
//| widthInByte [in] surface's width in bytes
//| height [in] surface's height
//| format [in] surface's format
//| copyDirection [in] copy direction, cpu -> gpu or gpu -> cpu
//| gpuCopyKernelParam [out] kernel param
//|
//| Returns: Result of the operation.
//|
//*---------------------------------------------------------------------------------------------------------
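//
// The flow below: first look for an existing, unlocked kernel whose kernel ID
// matches the requested width/height/format/direction and reuse it; otherwise
// allocate a new predefined copy kernel, compute its ID, lock it and append it
// to m_copyKernelParamArray. Callers unlock the kernel (GPUCOPY_KERNEL_UNLOCK)
// once the copy task has been enqueued.
//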
int32_t CmQueueRT::CreateGPUCopyKernel(uint32_t widthInByte,
uint32_t height,
CM_SURFACE_FORMAT format,
CM_GPUCOPY_DIRECTION copyDirection,
CM_GPUCOPY_KERNEL* &gpuCopyKernelParam)
{
int32_t hr = CM_SUCCESS;
//Search existing kernel
CM_CHK_CMSTATUS_GOTOFINISH(SearchGPUCopyKernel(widthInByte, height, format, copyDirection, gpuCopyKernelParam));
if(gpuCopyKernelParam != nullptr)
{ // reuse
GPUCOPY_KERNEL_LOCK(gpuCopyKernelParam);
}
else
{
gpuCopyKernelParam = new (std::nothrow) CM_GPUCOPY_KERNEL ;
CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyKernelParam);
CmSafeMemSet(gpuCopyKernelParam, 0, sizeof(CM_GPUCOPY_KERNEL));
CM_CHK_CMSTATUS_GOTOFINISH(AllocateGPUCopyKernel(widthInByte, height, format, copyDirection, gpuCopyKernelParam->kernel));
CM_CHK_CMSTATUS_GOTOFINISH(GetGPUCopyKrnID(widthInByte, height, format, copyDirection, gpuCopyKernelParam->kernelID));
GPUCOPY_KERNEL_LOCK(gpuCopyKernelParam);
CM_CHK_CMSTATUS_GOTOFINISH(AddGPUCopyKernel(gpuCopyKernelParam));
}
finish:
if( hr != CM_SUCCESS)
{
CmSafeDelete(gpuCopyKernelParam);
}
return hr;
}
//*---------------------------------------------------------------------------------------------------------
//| Name: SearchGPUCopyKernel()
//| Purpose: Search if the required kernel exists
//| Arguments:
//| widthInByte [in] surface's width in bytes
//| height [in] surface's height
//| format [in] surface's format
//| copyDirection [in] copy direction, cpu -> gpu or gpu -> cpu
//| gpuCopyKernelParam [out] kernel param
//|
//| Returns: Result of the operation.
//|
//*---------------------------------------------------------------------------------------------------------
int32_t CmQueueRT::SearchGPUCopyKernel(uint32_t widthInByte,
uint32_t height,
CM_SURFACE_FORMAT format,
CM_GPUCOPY_DIRECTION copyDirection,
CM_GPUCOPY_KERNEL* &kernelParam)
{
int32_t hr = CM_SUCCESS;
CM_GPUCOPY_KERNEL *gpucopyKernel = nullptr;
CM_GPUCOPY_KERNEL_ID kernelTypeID = GPU_COPY_KERNEL_UNKNOWN;
kernelParam = nullptr;
CM_CHK_CMSTATUS_GOTOFINISH(GetGPUCopyKrnID(widthInByte, height, format, copyDirection, kernelTypeID));
for(uint32_t index =0 ; index< m_copyKernelParamArrayCount; index++)
{
gpucopyKernel = (CM_GPUCOPY_KERNEL*)m_copyKernelParamArray.GetElement(index);
if(gpucopyKernel != nullptr)
{
if(!gpucopyKernel->locked &&
gpucopyKernel->kernelID == kernelTypeID)
{
kernelParam = gpucopyKernel;
break;
}
}
}
finish:
return hr;
}
//*---------------------------------------------------------------------------------------------------------
//| Name: AddGPUCopyKernel()
//| Purpose: Add new kernel into m_copyKernelParamArray
//| Arguments:
//| kernelParam [in] pointer to the locked kernel param to be added
//|
//| Returns: Result of the operation.
//|
//*---------------------------------------------------------------------------------------------------------
int32_t CmQueueRT::AddGPUCopyKernel(CM_GPUCOPY_KERNEL* &kernelParam)
{
int32_t hr = CM_SUCCESS;
// critical section protection
CLock locker(m_criticalSectionGPUCopyKrn);
CM_CHK_NULL_GOTOFINISH(kernelParam, CM_INVALID_GPUCOPY_KERNEL);
// the newly created kernel must be locked
if(!kernelParam->locked)
{
CM_ASSERTMESSAGE("Error: The newly created kernel must be locked.")
hr = CM_INVALID_GPUCOPY_KERNEL;
goto finish;
}
m_copyKernelParamArray.SetElement(m_copyKernelParamArrayCount, kernelParam);
m_copyKernelParamArrayCount ++;
finish:
return hr;
}
//*---------------------------------------------------------------------------------------------------------
//| Name: GetGPUCopyKrnID()
//| Purpose: Calculate the kernel ID according to the surface's width, height, format and copy direction
//| Arguments:
//| widthInByte [in] surface's width in bytes
//| height [in] surface's height
//| format [in] surface's format
//| copyDirection [in] copy direction, cpu -> gpu or gpu -> cpu
//| kernelID [out] kernel id
//|
//| Returns: Result of the operation.
//|
//*---------------------------------------------------------------------------------------------------------
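//
// Worked example (illustrative only): an NV12 GPU->CPU copy with height = 720
// rows (a multiple of 8) and widthInByte = 1280 (a multiple of 128) maps to
// GPU_COPY_KERNEL_GPU2CPU_ALIGNED_NV12_ID, while height = 722 or
// widthInByte = 1000 would select the unaligned variant instead.
//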
int32_t CmQueueRT::GetGPUCopyKrnID( uint32_t widthInByte, uint32_t height, CM_SURFACE_FORMAT format,
CM_GPUCOPY_DIRECTION copyDirection, CM_GPUCOPY_KERNEL_ID &kernelID )
{
int32_t hr = CM_SUCCESS;
kernelID = GPU_COPY_KERNEL_UNKNOWN;
if (format == CM_SURFACE_FORMAT_NV12 || format == CM_SURFACE_FORMAT_P010 || format == CM_SURFACE_FORMAT_P016)
{
switch(copyDirection)
{
case CM_FASTCOPY_GPU2CPU:
if ( (height&0x7) ||(widthInByte&0x7f))
{
kernelID = GPU_COPY_KERNEL_GPU2CPU_UNALIGNED_NV12_ID ;
}
else
{ // height 8-row aligned, widthByte 128 multiple
kernelID = GPU_COPY_KERNEL_GPU2CPU_ALIGNED_NV12_ID ;
}
break;
case CM_FASTCOPY_CPU2GPU:
kernelID = GPU_COPY_KERNEL_CPU2GPU_NV12_ID;
break;
case CM_FASTCOPY_GPU2GPU:
kernelID = GPU_COPY_KERNEL_GPU2GPU_NV12_ID;
break;
case CM_FASTCOPY_CPU2CPU:
kernelID = GPU_COPY_KERNEL_CPU2CPU_ID;
break;
default :
CM_ASSERTMESSAGE("Error: Invalid fast copy direction.")
hr = CM_FAILURE;
break;
}
}
else
{
switch(copyDirection)
{
case CM_FASTCOPY_GPU2CPU:
if ( (height&0x7) ||(widthInByte&0x7f))
{
kernelID = GPU_COPY_KERNEL_GPU2CPU_UNALIGNED_ID;
}
else
{ // height 8-row aligned, widthByte 128 multiple
kernelID = GPU_COPY_KERNEL_GPU2CPU_ALIGNED_ID;
}
break;
case CM_FASTCOPY_CPU2GPU:
kernelID = GPU_COPY_KERNEL_CPU2GPU_ID;
break;
case CM_FASTCOPY_GPU2GPU:
kernelID = GPU_COPY_KERNEL_GPU2GPU_ID;
break;
case CM_FASTCOPY_CPU2CPU:
kernelID = GPU_COPY_KERNEL_CPU2CPU_ID;
break;
default :
CM_ASSERTMESSAGE("Error: Invalid fast copy direction.")
hr = CM_FAILURE;
break;
}
}
return hr;
}
//*---------------------------------------------------------------------------------------------------------
//| Name: AllocateGPUCopyKernel()
//| Purpose: Allocate GPUCopy Kernel
//| Arguments:
//| widthInByte [in] surface's width in bytes
//| height [in] surface's height
//| format [in] surface's format
//| copyDirection [in] copy direction, cpu -> gpu or gpu -> cpu
//| kernel [out] pointer to created kernel
//|
//| Returns: Result of the operation.
//|
//*---------------------------------------------------------------------------------------------------------
int32_t CmQueueRT::AllocateGPUCopyKernel( uint32_t widthInByte, uint32_t height, CM_SURFACE_FORMAT format,
CM_GPUCOPY_DIRECTION copyDirection, CmKernel *&kernel )
{
int32_t hr = CM_SUCCESS;
CmProgram *gpuCopyProgram = nullptr;
CM_CHK_CMSTATUS_GOTOFINISH( m_device->LoadPredefinedCopyKernel(gpuCopyProgram));
CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyProgram);
if (format == CM_SURFACE_FORMAT_NV12 || format == CM_SURFACE_FORMAT_P010 || format == CM_SURFACE_FORMAT_P016)
{
switch(copyDirection)
{
case CM_FASTCOPY_GPU2CPU:
if ( (height&0x7) ||(widthInByte&0x7f))
{
CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel( gpuCopyProgram, _NAME( surfaceCopy_read_NV12_32x32 ) , kernel,"PredefinedGPUCopyKernel"));
}
else
{ // height 8-row aligned, widthByte 128 multiple
CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel( gpuCopyProgram, _NAME( surfaceCopy_read_NV12_aligned_32x32 ) , kernel,"PredefinedGPUCopyKernel"));
}
break;
case CM_FASTCOPY_CPU2GPU:
CM_CHK_CMSTATUS_GOTOFINISH( m_device->CreateKernel( gpuCopyProgram, _NAME( surfaceCopy_write_NV12_32x32 ), kernel, "PredefinedGPUCopyKernel"));
break;
case CM_FASTCOPY_GPU2GPU:
CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(SurfaceCopy_2DTo2D_NV12_32x32), kernel, "PredefinedGPUCopyKernel"));
break;
case CM_FASTCOPY_CPU2CPU:
CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(SurfaceCopy_BufferToBuffer_4k), kernel, "PredefinedGPUCopyKernel"));
break;
default :
CM_ASSERTMESSAGE("Error: Invalid fast copy direction.")
hr = CM_FAILURE;
break;
}
}
else
{
switch(copyDirection)
{
case CM_FASTCOPY_GPU2CPU:
if ( (height&0x7) ||(widthInByte&0x7f))
{
CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel( gpuCopyProgram, _NAME( surfaceCopy_read_32x32 ) , kernel, "PredefinedGPUCopyKernel"));
}
else
{ // height 8-row aligned, widthByte 128 multiple
CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel( gpuCopyProgram, _NAME( surfaceCopy_read_aligned_32x32 ) , kernel, "PredefinedGPUCopyKernel"));
}
break;
case CM_FASTCOPY_CPU2GPU:
CM_CHK_CMSTATUS_GOTOFINISH( m_device->CreateKernel( gpuCopyProgram, _NAME( surfaceCopy_write_32x32 ), kernel, "PredefinedGPUCopyKernel" ));
break;
case CM_FASTCOPY_GPU2GPU:
CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(SurfaceCopy_2DTo2D_32x32), kernel, "PredefinedGPUCopyKernel"));
break;
case CM_FASTCOPY_CPU2CPU:
CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(SurfaceCopy_BufferToBuffer_4k), kernel, "PredefinedGPUCopyKernel"));
break;
default :
CM_ASSERTMESSAGE("Error: Invalid fast copy direction.")
hr = CM_FAILURE;
break;
}
}
finish:
return hr;
}
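//*---------------------------------------------------------------------------------------------------------
//| Name: EnqueueFast()
//| Purpose: Enqueue a task through the fast path. When media mode is unavailable the task is submitted
//| as a compute task on this queue's stream index; otherwise it goes through the advanced
//| executor's SubmitTask path.
//| Arguments:
//| task [in] pointer to the task to enqueue
//| event [out] reference to the event created for this enqueue
//| threadSpace [in] optional thread space describing the dispatch
//|
//| Returns: Result of the operation.
//|
//*---------------------------------------------------------------------------------------------------------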
CM_RT_API int32_t CmQueueRT::EnqueueFast(CmTask *task,
CmEvent* &event,
const CmThreadSpace *threadSpace)
{
CM_HAL_STATE * state = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
int32_t result = CM_SUCCESS;
if (state == nullptr || state->advExecutor == nullptr)
{
result = CM_NULL_POINTER;
}
else
{
const CmThreadSpaceRT *threadSpaceRTConst
= static_cast<const CmThreadSpaceRT*>(threadSpace);
if (state->cmHalInterface->CheckMediaModeAvailability() == false)
{
uint32_t old_stream_idx = state->osInterface->streamIndex;
state->osInterface->streamIndex = m_streamIndex;
if (threadSpaceRTConst != nullptr)
{
result = state->advExecutor->SubmitComputeTask(
this, task, event,
threadSpaceRTConst->GetThreadGroupSpace(),
(MOS_GPU_CONTEXT)m_queueOption.GPUContext);
}
else
{
result = state->advExecutor->SubmitComputeTask(
this, task, event, nullptr,
(MOS_GPU_CONTEXT)m_queueOption.GPUContext);
}
state->osInterface->streamIndex = old_stream_idx;
}
else
{
result = state->advExecutor->SubmitTask(
this, task, event, threadSpace,
(MOS_GPU_CONTEXT)m_queueOption.GPUContext);
}
}
return result;
}
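//*---------------------------------------------------------------------------------------------------------
//| Name: DestroyEventFast()
//| Purpose: Destroy an event created by the fast-path enqueue, through the advanced executor.
//| Arguments:
//| event [in/out] reference to the event pointer to destroy
//|
//| Returns: Result of the operation.
//|
//| Illustrative fast-path usage from an application's view (a sketch only, assuming a valid CmQueue
//| and CmTask were created elsewhere):
//| CmEvent *event = nullptr;
//| queue->EnqueueFast(task, event, nullptr); // submit through the advanced executor
//| event->WaitForTaskFinished(); // block until the GPU work completes
//| queue->DestroyEventFast(event); // release the event via the same fast path
//*---------------------------------------------------------------------------------------------------------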
CM_RT_API int32_t CmQueueRT::DestroyEventFast(CmEvent *&event)
{
CM_HAL_STATE * state = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
if (state == nullptr || state->advExecutor == nullptr)
{
return CM_NULL_POINTER;
}
else
{
return state->advExecutor->DestoryEvent(this, event);
}
}
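//*---------------------------------------------------------------------------------------------------------
//| Name: EnqueueWithGroupFast()
//| Purpose: Enqueue a task with a thread group space through the fast path; the OS stream index is
//| switched to this queue's stream for the duration of the submission.
//| Arguments:
//| task [in] pointer to the task to enqueue
//| event [out] reference to the event created for this enqueue
//| threadGroupSpace [in] thread group space describing the GPGPU dispatch
//|
//| Returns: Result of the operation.
//|
//*---------------------------------------------------------------------------------------------------------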
CM_RT_API int32_t
CmQueueRT::EnqueueWithGroupFast(CmTask *task,
CmEvent* &event,
const CmThreadGroupSpace *threadGroupSpace)
{
CM_HAL_STATE * state = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
int32_t result = CM_SUCCESS;
if (state == nullptr || state->advExecutor == nullptr)
{
return CM_NULL_POINTER;
}
uint32_t old_stream_idx = state->osInterface->streamIndex;
state->osInterface->streamIndex = m_streamIndex;
result = state->advExecutor->SubmitComputeTask(
this, task, event, threadGroupSpace,
(MOS_GPU_CONTEXT)m_queueOption.GPUContext);
state->osInterface->streamIndex = old_stream_idx;
return result;
}
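//*---------------------------------------------------------------------------------------------------------
//| Name: GetOSSyncEventHandle()
//| Purpose: Return the OS sync event handle registered for this queue.
//| Arguments:
//| hOSSyncEvent [out] reference receiving the OS sync event handle
//|
//| Returns: CM_SUCCESS.
//*---------------------------------------------------------------------------------------------------------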
int32_t CmQueueRT::GetOSSyncEventHandle(void *& hOSSyncEvent)
{
hOSSyncEvent = m_osSyncEvent;
return CM_SUCCESS;
}
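//*---------------------------------------------------------------------------------------------------------
//| Name: RegisterSyncEvent()
//| Purpose: Register the UMD notification event with the HAL so task completion can be reported through
//| the event-driven mechanism; the returned handle is cached in m_osSyncEvent.
//|
//| Returns: Result of the operation.
//*---------------------------------------------------------------------------------------------------------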
int32_t CmQueueRT::RegisterSyncEvent()
{
CM_RETURN_CODE hr = CM_SUCCESS;
CM_HAL_OSSYNC_PARAM syncParam;
void *syncEventHandle = nullptr;
syncParam.osSyncEvent = syncEventHandle;
PCM_CONTEXT_DATA cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
PCM_HAL_STATE cmHalState = cmData->cmHalState;
// Call HAL layer to wait for Task finished with event-driven mechanism
CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(cmHalState->pfnRegisterUMDNotifyEventHandle(cmHalState, &syncParam));
m_osSyncEvent = syncParam.osSyncEvent;
finish:
return hr;
}
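//*---------------------------------------------------------------------------------------------------------
//| Name: CreateGpuContext()
//| Purpose: Create the GPU context used by this queue. For the CM compute context a new stream index is
//| registered and switched in before creation; for the single render context the current stream
//| index is reused.
//| Arguments:
//| halState [in] CM HAL state
//| gpuContextName [in] name of the GPU context to create
//| gpuNode [in] GPU node the context is created on
//| createOptions [in] GPU context creation options
//|
//| Returns: Result of the operation.
//*---------------------------------------------------------------------------------------------------------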
MOS_STATUS CmQueueRT::CreateGpuContext(CM_HAL_STATE *halState,
MOS_GPU_CONTEXT gpuContextName,
MOS_GPU_NODE gpuNode,
MOS_GPUCTX_CREATOPTIONS *createOptions)
{
uint32_t old_stream_idx = 0;
if (MOS_GPU_CONTEXT_CM_COMPUTE == gpuContextName)
{
m_streamIndex = halState->pfnRegisterStream(halState);
old_stream_idx = halState->osInterface->streamIndex;
halState->osInterface->streamIndex = m_streamIndex;
}
else
{ // As there is only one render context, the original stream index will be used.
old_stream_idx = m_streamIndex = halState->osInterface->streamIndex;
}
MOS_STATUS status = halState->pfnCreateGPUContext(halState,
gpuContextName, gpuNode,
createOptions);
halState->osInterface->streamIndex = old_stream_idx;
return status;
}
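//*---------------------------------------------------------------------------------------------------------
//| Name: ExecuteGroupTask()
//| Purpose: Execute a thread-group (GPGPU) task on the given GPU context. The OS stream index is switched
//| to this queue's stream, the sync event is registered, and the HAL group-task execution is invoked.
//| Arguments:
//| halState [in] CM HAL state
//| taskParam [in] parameters of the group task to execute
//| gpuContextName [in] GPU context to submit the task to
//|
//| Returns: Result of the operation.
//*---------------------------------------------------------------------------------------------------------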
MOS_STATUS CmQueueRT::ExecuteGroupTask(CM_HAL_STATE *halState,
CM_HAL_EXEC_TASK_GROUP_PARAM *taskParam,
MOS_GPU_CONTEXT gpuContextName)
{
uint32_t old_stream_idx = halState->osInterface->streamIndex;
halState->osInterface->streamIndex = m_streamIndex;
MOS_STATUS result
= halState->osInterface->pfnSetGpuContext(halState->osInterface,
gpuContextName);
if (MOS_STATUS_SUCCESS != result)
{
return result;
}
RegisterSyncEvent();
result = halState->pfnExecuteGroupTask(halState, taskParam);
halState->osInterface->streamIndex = old_stream_idx;
return result;
}
} // namespace