/*
* Copyright (c) 2018-2021, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
//!
//! \file    mos_gpucontext_specific.cpp
//! \brief   Container class for the Linux specific gpu context
//!

#include "mos_context_specific.h"
#include "mos_gpucontext_specific.h"
#include "mos_graphicsresource_specific.h"
#include "mos_commandbuffer_specific.h"
#include "mos_util_devult_specific.h"
#include "mos_cmdbufmgr.h"
#include "mos_os_virtualengine.h"
#include <unistd.h>

// NOTE(review): presumably the MI_BATCH_BUFFER_END command dword; it is not
// referenced in the part of the file visible here - confirm against its users.
#define MI_BATCHBUFFER_END 0x05000000
// File-local, statically initialized mutex. Judging by its name it serializes
// command dumping; it is not used in the part of the file visible here - confirm.
static pthread_mutex_t command_dump_mutex = PTHREAD_MUTEX_INITIALIZER;

//!
//! \brief    Construct a Linux GPU context object. Only records the arguments
//!           and default sizes; the real resources are acquired in Init().
//! \param    [in] gpuNode
//!           GPU node this context is created for (stored in m_nodeOrdinal).
//! \param    [in] mosGpuCtx
//!           MOS GPU context identifier (stored in m_gpuContext).
//! \param    [in] cmdBufMgr
//!           Command buffer manager that command buffers are picked up from.
//! \param    [in] reusedContext
//!           Ignored apart from a log message: reuse is not enabled on Linux.
//!
GpuContextSpecific::GpuContextSpecific(
    const MOS_GPU_NODE gpuNode,
    MOS_GPU_CONTEXT    mosGpuCtx,
    CmdBufMgr         *cmdBufMgr,
    GpuContext        *reusedContext)
{
    MOS_OS_FUNCTION_ENTER;

    m_gpuContext           = mosGpuCtx;
    m_nodeOrdinal          = gpuNode;
    m_cmdBufMgr            = cmdBufMgr;
    m_maxPatchLocationsize = PATCHLOCATIONLIST_SIZE;
    m_statusBufferResource = nullptr;

    if (reusedContext != nullptr)
    {
        MOS_OS_NORMALMESSAGE("gpucontex reusing not enabled on Linux.");
    }

#if (_DEBUG || _RELEASE_INTERNAL)
    // Debug/internal builds only: let the user choose engine instances through
    // the INTEL_ENGINE_INSTANCE environment variable (parsed as hexadecimal).
    const char *userSetting = getenv("INTEL_ENGINE_INSTANCE");
    if (userSetting != nullptr)
    {
        errno             = 0;
        const long parsed = strtol(userSetting, nullptr, 16);

        // Reject positive overflow reported by strtol as well as negative values.
        const bool overflowed = (errno == ERANGE) && (parsed == LONG_MAX);
        if (overflowed || (parsed < 0))
        {
            MOS_OS_NORMALMESSAGE("Invalid INTEL_ENGINE_INSTANCE setting.(%s)\n", userSetting);
            m_engineInstanceSelect = 0x0;
        }
        else
        {
            m_engineInstanceSelect = (uint32_t)parsed;
        }
    }
#endif
}

//!
//! \brief    Destroy the GPU context; all teardown is delegated to Clear().
//!
GpuContextSpecific::~GpuContextSpecific()
{
    MOS_OS_FUNCTION_ENTER;

    // Clear() releases the status buffer, the pooled command buffers, the
    // bookkeeping lists and the i915 contexts owned by this object.
    Clear();
}

//!
//! \brief    Initialize the GPU context: command buffer pool state, status
//!           buffer, allocation/patch bookkeeping lists and, when context
//!           based scheduling is enabled, the i915 context(s) bound to the
//!           engines serving GpuNode.
//! \details  On failure the members allocated so far are kept; Clear()
//!           (called by the destructor) releases them.
//! \param    [in] osContext
//!           OS context, stored in m_osContext. Must not be null.
//! \param    [in] osInterface
//!           OS interface supplying ctxBasedScheduling, bGucSubmission and
//!           pOsContext. Must not be null.
//! \param    [in] GpuNode
//!           GPU node; selects the i915 engine class to bind to.
//! \param    [in] createOption
//!           Creation options (SSEU value, optionally the enhanced options).
//!           Must not be null.
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS on success, an error status otherwise.
//!
MOS_STATUS GpuContextSpecific::Init(OsContext *osContext,
                    PMOS_INTERFACE osInterface,
                    MOS_GPU_NODE GpuNode,
                    PMOS_GPUCTX_CREATOPTIONS createOption)
{
    MOS_OS_FUNCTION_ENTER;

    MOS_OS_CHK_NULL_RETURN(osContext);
    // Both are dereferenced unconditionally further down; reject null before
    // any state of this object is modified.
    MOS_OS_CHK_NULL_RETURN(osInterface);
    MOS_OS_CHK_NULL_RETURN(createOption);

    if (m_cmdBufPoolMutex == nullptr)
    {
        m_cmdBufPoolMutex = MosUtilities::MosCreateMutex();
    }

    MOS_OS_CHK_NULL_RETURN(m_cmdBufPoolMutex);

    MosUtilities::MosLockMutex(m_cmdBufPoolMutex);

    m_cmdBufPool.clear();

    MosUtilities::MosUnlockMutex(m_cmdBufPoolMutex);

    m_commandBufferSize = COMMAND_BUFFER_SIZE;

    m_nextFetchIndex = 0;

    m_cmdBufFlushed = true;

    m_osContext = osContext;

    MOS_OS_CHK_STATUS_RETURN(AllocateGPUStatusBuf());

    m_commandBuffer = (PMOS_COMMAND_BUFFER)MOS_AllocAndZeroMemory(sizeof(MOS_COMMAND_BUFFER));

    MOS_OS_CHK_NULL_RETURN(m_commandBuffer);

    m_IndirectHeapSize = 0;

    // each thread has its own GPU context, so do not need any lock as guarder here
    m_allocationList = (ALLOCATION_LIST *)MOS_AllocAndZeroMemory(sizeof(ALLOCATION_LIST) * ALLOCATIONLIST_SIZE);
    MOS_OS_CHK_NULL_RETURN(m_allocationList);
    m_maxNumAllocations = ALLOCATIONLIST_SIZE;

    m_patchLocationList = (PATCHLOCATIONLIST *)MOS_AllocAndZeroMemory(sizeof(PATCHLOCATIONLIST) * PATCHLOCATIONLIST_SIZE);
    MOS_OS_CHK_NULL_RETURN(m_patchLocationList);
    m_maxPatchLocationsize = PATCHLOCATIONLIST_SIZE;

    m_attachedResources = (PMOS_RESOURCE)MOS_AllocAndZeroMemory(sizeof(MOS_RESOURCE) * ALLOCATIONLIST_SIZE);
    MOS_OS_CHK_NULL_RETURN(m_attachedResources);

    m_writeModeList = (bool *)MOS_AllocAndZeroMemory(sizeof(bool) * ALLOCATIONLIST_SIZE);
    MOS_OS_CHK_NULL_RETURN(m_writeModeList);

    m_GPUStatusTag = 1;

    // Keep a private copy of the creation options that matter later on.
    m_createOptionEnhanced = (MOS_GPUCTX_CREATOPTIONS_ENHANCED*)MOS_AllocAndZeroMemory(sizeof(MOS_GPUCTX_CREATOPTIONS_ENHANCED));
    MOS_OS_CHK_NULL_RETURN(m_createOptionEnhanced);
    m_createOptionEnhanced->SSEUValue = createOption->SSEUValue;

    if (typeid(*createOption) == typeid(MOS_GPUCTX_CREATOPTIONS_ENHANCED))
    {
        PMOS_GPUCTX_CREATOPTIONS_ENHANCED createOptionEnhanced = static_cast<PMOS_GPUCTX_CREATOPTIONS_ENHANCED>(createOption);
        m_createOptionEnhanced->UsingSFC = createOptionEnhanced->UsingSFC;
    }

    // Number of slots in m_i915Context; every index written below is kept
    // strictly below this value.
    const unsigned int ctxSlots = MAX_ENGINE_INSTANCE_NUM + 1;

    for (unsigned int i = 0; i < ctxSlots; i++)
    {
        m_i915Context[i] = nullptr;
    }

    if (osInterface->ctxBasedScheduling)
    {
        // pOsContext is dereferenced for every context creation below.
        MOS_OS_CHK_NULL_RETURN(osInterface->pOsContext);

        unsigned int nengine              = 0;
        struct i915_engine_class_instance *engine_map = nullptr;

        MOS_TraceEventExt(EVENT_GPU_CONTEXT_CREATE, EVENT_TYPE_START,
                          &GpuNode, sizeof(GpuNode), nullptr, 0);

        m_i915Context[0] = mos_gem_context_create_shared(osInterface->pOsContext->bufmgr,
                                             osInterface->pOsContext->intel_context,
                                             I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE);
        if (m_i915Context[0] == nullptr)
        {
            MOS_OS_ASSERTMESSAGE("Failed to create context.\n");
            return MOS_STATUS_UNKNOWN;
        }
        m_i915Context[0]->pOsContext = osInterface->pOsContext;

        m_i915ExecFlag = I915_EXEC_DEFAULT;

        // The total engine count sizes engine_map; mos_query_engines() later
        // overwrites nengine with the number of engines of the queried class.
        if (mos_query_engines_count(osInterface->pOsContext->bufmgr, &nengine) || (nengine == 0))
        {
            MOS_OS_ASSERTMESSAGE("Failed to query engines count.\n");
            return MOS_STATUS_UNKNOWN;
        }
        engine_map = (struct i915_engine_class_instance *)MOS_AllocAndZeroMemory(nengine * sizeof(struct i915_engine_class_instance));
        MOS_OS_CHK_NULL_RETURN(engine_map);

        if (GpuNode == MOS_GPU_NODE_3D)
        {
            __u16 engine_class = I915_ENGINE_CLASS_RENDER;
            __u64 caps = 0;

            if (mos_query_engines(osInterface->pOsContext->bufmgr, engine_class, caps, &nengine, engine_map))
            {
                MOS_OS_ASSERTMESSAGE("Failed to query engines.\n");
                MOS_SafeFreeMemory(engine_map);
                return MOS_STATUS_UNKNOWN;
            }

            if (mos_set_context_param_load_balance(m_i915Context[0], engine_map, nengine))
            {
                MOS_OS_ASSERTMESSAGE("Failed to set balancer extension.\n");
                MOS_SafeFreeMemory(engine_map);
                return MOS_STATUS_UNKNOWN;
            }

            if (createOption->SSEUValue != 0)
            {
                struct drm_i915_gem_context_param_sseu sseu;
                MOS_ZeroMemory(&sseu, sizeof(sseu));
                sseu.flags = I915_CONTEXT_SSEU_FLAG_ENGINE_INDEX;
                sseu.engine.engine_instance = m_i915ExecFlag;

                if (mos_get_context_param_sseu(m_i915Context[0], &sseu))
                {
                    MOS_OS_ASSERTMESSAGE("Failed to get sseu configuration.");
                    MOS_SafeFreeMemory(engine_map);
                    return MOS_STATUS_UNKNOWN;
                }

                // Switch off subslices until no more than the requested count remain enabled.
                if (mos_hweight8(sseu.subslice_mask) > createOption->packed.SubSliceCount)
                {
                    sseu.subslice_mask = mos_switch_off_n_bits(sseu.subslice_mask,
                            mos_hweight8(sseu.subslice_mask)-createOption->packed.SubSliceCount);
                }

                if (mos_set_context_param_sseu(m_i915Context[0], sseu))
                {
                    MOS_OS_ASSERTMESSAGE("Failed to set sseu configuration.");
                    MOS_SafeFreeMemory(engine_map);
                    return MOS_STATUS_UNKNOWN;
                }
            }
        }
        else if (GpuNode == MOS_GPU_NODE_COMPUTE)
        {
            __u16 engine_class = 4; //To change later when linux define the name
            __u64 caps = 0;

            if (mos_query_engines(osInterface->pOsContext->bufmgr, engine_class, caps, &nengine, engine_map))
            {
                MOS_OS_ASSERTMESSAGE("Failed to query engines.\n");
                MOS_SafeFreeMemory(engine_map);
                return MOS_STATUS_UNKNOWN;
            }

#if (_DEBUG || _RELEASE_INTERNAL)
            SelectEngineInstanceByUser(engine_map, &nengine, m_engineInstanceSelect, GpuNode);
#endif
            if (mos_set_context_param_load_balance(m_i915Context[0], engine_map, nengine))
            {
                MOS_OS_ASSERTMESSAGE("Failed to set balancer extension.\n");
                MOS_SafeFreeMemory(engine_map);
                return MOS_STATUS_UNKNOWN;
            }
        }
        else if (GpuNode == MOS_GPU_NODE_VIDEO || GpuNode == MOS_GPU_NODE_VIDEO2
                 || GpuNode == MOS_GPU_NODE_VE)
        {
            __u16 engine_class = (GpuNode == MOS_GPU_NODE_VE)? I915_ENGINE_CLASS_VIDEO_ENHANCE : I915_ENGINE_CLASS_VIDEO;
            __u64 caps = 0;

            SetEngineQueryFlags(createOption, caps);

            if (mos_query_engines(osInterface->pOsContext->bufmgr, engine_class, caps, &nengine, engine_map))
            {
                MOS_OS_ASSERTMESSAGE("Failed to query engines.\n");
                MOS_SafeFreeMemory(engine_map);
                return MOS_STATUS_UNKNOWN;
            }

#if (_DEBUG || _RELEASE_INTERNAL)
            SelectEngineInstanceByUser(engine_map, &nengine, m_engineInstanceSelect, GpuNode);
#endif
            if (mos_set_context_param_load_balance(m_i915Context[0], engine_map, nengine))
            {
                MOS_OS_ASSERTMESSAGE("Failed to set balancer extension.\n");
                MOS_SafeFreeMemory(engine_map);
                return MOS_STATUS_UNKNOWN;
            }

            if (nengine >= 2)
            {
                if(!osInterface->bGucSubmission)
                {
                    //master queue
                    m_i915Context[1] = mos_gem_context_create_shared(osInterface->pOsContext->bufmgr,
                                                                     osInterface->pOsContext->intel_context,
                                                                     I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE);
                    if (m_i915Context[1] == nullptr)
                    {
                        MOS_OS_ASSERTMESSAGE("Failed to create master context.\n");
                        MOS_SafeFreeMemory(engine_map);
                        return MOS_STATUS_UNKNOWN;
                    }
                    m_i915Context[1]->pOsContext = osInterface->pOsContext;

                    if (mos_set_context_param_load_balance(m_i915Context[1], engine_map, 1))
                    {
                        MOS_OS_ASSERTMESSAGE("Failed to set master context bond extension.\n");
                        MOS_SafeFreeMemory(engine_map);
                        return MOS_STATUS_UNKNOWN;
                    }

                    //slave queue
                    // Slave i lives in slot i+1; stop before running past the
                    // last slot of m_i915Context.
                    for (unsigned int i = 1; (i < nengine) && (i + 1 < ctxSlots); i++)
                    {
                        m_i915Context[i+1] = mos_gem_context_create_shared(osInterface->pOsContext->bufmgr,
                                                                         osInterface->pOsContext->intel_context,
                                                                         I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE);
                        if (m_i915Context[i+1] == nullptr)
                        {
                            MOS_OS_ASSERTMESSAGE("Failed to create slave context.\n");
                            MOS_SafeFreeMemory(engine_map);
                            return MOS_STATUS_UNKNOWN;
                        }
                        m_i915Context[i+1]->pOsContext = osInterface->pOsContext;

                        if (mos_set_context_param_bond(m_i915Context[i+1], engine_map[0], &engine_map[i], 1) != S_SUCCESS)
                        {
                            int err = errno;
                            if (err == ENODEV)
                            {
                                // ENODEV is tolerated: drop this slave context and stop creating more.
                                mos_gem_context_destroy(m_i915Context[i+1]);
                                m_i915Context[i+1] = nullptr;
                                break;
                            }
                            else
                            {
                                MOS_OS_ASSERTMESSAGE("Failed to set slave context bond extension. errno=%d\n",err);
                                MOS_SafeFreeMemory(engine_map);
                                return MOS_STATUS_UNKNOWN;
                            }
                        }
                    }
                }
                else
                {
                    //create context with different width
                    // Context of width i+1 lives in slot i; stay inside m_i915Context.
                    for (unsigned int i = 1; (i < nengine) && (i < ctxSlots); i++)
                    {
                        unsigned int ctxWidth = i + 1;
                        m_i915Context[i] = mos_gem_context_create_shared(osInterface->pOsContext->bufmgr,
                                                                     osInterface->pOsContext->intel_context,
                                                                     0); // I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE not allowed for parallel submission
                        if (m_i915Context[i] == nullptr)
                        {
                            // Do not hand a null context to the parallel setup / destroy calls.
                            MOS_OS_ASSERTMESSAGE("Failed to create context.\n");
                            break;
                        }
                        if (mos_set_context_param_parallel(m_i915Context[i], engine_map, ctxWidth) != S_SUCCESS)
                        {
                            MOS_OS_ASSERTMESSAGE("Failed to set parallel extension since discontinuous logical engine.\n");
                            mos_gem_context_destroy(m_i915Context[i]);
                            m_i915Context[i] = nullptr;
                            break;
                        }
                    }
                }
            }
        }
        else if (GpuNode == MOS_GPU_NODE_BLT)
        {
            __u16 engine_class = I915_ENGINE_CLASS_COPY;
            __u64 caps = 0;

            if (mos_query_engines(osInterface->pOsContext->bufmgr, engine_class, caps, &nengine, engine_map))
            {
                MOS_OS_ASSERTMESSAGE("Failed to query engines.\n");
                MOS_SafeFreeMemory(engine_map);
                return MOS_STATUS_UNKNOWN;
            }

            if (mos_set_context_param_load_balance(m_i915Context[0], engine_map, nengine))
            {
                MOS_OS_ASSERTMESSAGE("Failed to set balancer extension.\n");
                MOS_SafeFreeMemory(engine_map);
                return MOS_STATUS_UNKNOWN;
            }
        }
        else
        {
            MOS_OS_ASSERTMESSAGE("Unknown engine class.\n");
            MOS_SafeFreeMemory(engine_map);
            return MOS_STATUS_UNKNOWN;
        }
        MOS_SafeFreeMemory(engine_map);
        MOS_TraceEventExt(EVENT_GPU_CONTEXT_CREATE, EVENT_TYPE_END,
                          m_i915Context, sizeof(void *),
                          &nengine, sizeof(nengine));
    }
    return MOS_STATUS_SUCCESS;
}

//!
//! \brief    Release everything owned by this GPU context: the status buffer,
//!           the pooled command buffers (handed back to the command buffer
//!           manager), the bookkeeping lists and all i915 contexts.
//! \details  Every released member is reset to null, so calling Clear() again
//!           (for example explicitly and then from the destructor) is harmless.
//!
void GpuContextSpecific::Clear()
{
    MOS_OS_FUNCTION_ENTER;

    MOS_TraceEventExt(EVENT_GPU_CONTEXT_DESTROY, EVENT_TYPE_START,
                      m_i915Context, sizeof(void *), nullptr, 0);
    // handle the status buf bundled w/ the specified gpucontext
    if (m_statusBufferResource)
    {
        if (m_statusBufferResource->Unlock(m_osContext) != MOS_STATUS_SUCCESS)
        {
            MOS_OS_ASSERTMESSAGE("failed to unlock the status buf bundled w/ the specified gpucontext");
        }
        m_statusBufferResource->Free(m_osContext, 0);
        MOS_Delete(m_statusBufferResource);
    }
    MOS_FreeMemAndSetNull(m_statusBufferMosResource);

    // The mutex only exists once Init() has created it.
    if (m_cmdBufPoolMutex)
    {
        MosUtilities::MosLockMutex(m_cmdBufPoolMutex);
    }

    if (m_cmdBufMgr)
    {
        for (auto& curCommandBuffer : m_cmdBufPool)
        {
            auto curCommandBufferSpecific = static_cast<CommandBufferSpecific *>(curCommandBuffer);
            if (curCommandBufferSpecific == nullptr)
                continue;
            curCommandBufferSpecific->waitReady(); // wait ready and return to command buffer manager.
            m_cmdBufMgr->ReleaseCmdBuf(curCommandBuffer);
        }
    }

    m_cmdBufPool.clear();

    if (m_cmdBufPoolMutex)
    {
        MosUtilities::MosUnlockMutex(m_cmdBufPoolMutex);
        MosUtilities::MosDestroyMutex(m_cmdBufPoolMutex);
        m_cmdBufPoolMutex = nullptr;
    }

    // Free and null the lists so no dangling pointer survives this call.
    MOS_FreeMemAndSetNull(m_commandBuffer);
    MOS_FreeMemAndSetNull(m_allocationList);
    MOS_FreeMemAndSetNull(m_patchLocationList);
    MOS_FreeMemAndSetNull(m_attachedResources);
    MOS_FreeMemAndSetNull(m_writeModeList);
    MOS_FreeMemAndSetNull(m_createOptionEnhanced);

    // Init() populates MAX_ENGINE_INSTANCE_NUM + 1 slots (slot 0 plus one per
    // engine instance), so the last slot must be visited as well.
    for (int i=0; i<MAX_ENGINE_INSTANCE_NUM+1; i++)
    {
        if (m_i915Context[i])
        {
            mos_gem_context_destroy(m_i915Context[i]);
            m_i915Context[i] = nullptr;
        }
    }
    MOS_TraceEventExt(EVENT_GPU_CONTEXT_DESTROY, EVENT_TYPE_END,
                      nullptr, 0, nullptr, 0);
}

//!
//! \brief    Register a resource with this GPU context's allocation list.
//! \details  A resource whose bo is already registered reuses its slot (the
//!           write flag is OR-ed in); otherwise the next free slot is taken.
//!           All validation happens before any state is modified, so a failed
//!           call leaves the registration count untouched.
//! \param    [in,out] osResource
//!           Resource to register; its iAllocationIndex entry for this GPU
//!           context is set to the slot used. Must not be null.
//! \param    [in] writeFlag
//!           True when the resource is written by the GPU.
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS on success; MOS_STATUS_UNKNOWN when the GPU
//!           context id is out of range or the allocation list is full.
//!
MOS_STATUS GpuContextSpecific::RegisterResource(
    PMOS_RESOURCE osResource,
    bool          writeFlag)
{
    MOS_OS_FUNCTION_ENTER;

    MOS_OS_CHK_NULL_RETURN(osResource);

    MOS_OS_CHK_NULL_RETURN(m_attachedResources);
    MOS_OS_CHK_NULL_RETURN(m_writeModeList);
    MOS_OS_CHK_NULL_RETURN(m_allocationList);

    // m_gpuContext indexes osResource->iAllocationIndex below.
    if (m_gpuContext >= MOS_GPU_CONTEXT_MAX)
    {
        MOS_OS_ASSERTMESSAGE("Gpu context exceeds max.");
        return MOS_STATUS_UNKNOWN;
    }

    // Look for an existing registration of the same buffer object.
    uint32_t allocationIndex = 0;
    for (; allocationIndex < m_resCount; allocationIndex++)
    {
        if (osResource->bo == m_attachedResources[allocationIndex].bo)
        {
            break;
        }
    }

    if (allocationIndex >= m_maxNumAllocations)
    {
        MOS_OS_ASSERTMESSAGE("Reached max # registrations.");
        return MOS_STATUS_UNKNOWN;
    }

    // New buffer: it occupies the slot right after the last registered one.
    if (allocationIndex == m_resCount)
    {
        m_resCount++;
    }

    // Set allocation
    osResource->iAllocationIndex[m_gpuContext] = (allocationIndex);
    m_attachedResources[allocationIndex]           = *osResource;
    m_writeModeList[allocationIndex] |= writeFlag;
    m_allocationList[allocationIndex].hAllocation = &m_attachedResources[allocationIndex];
    m_allocationList[allocationIndex].WriteOperation |= writeFlag;
    m_numAllocations = m_resCount;

    return MOS_STATUS_SUCCESS;
}

//!
//! \brief    Append one entry to the patch location list.
//! \param    [in] osInterface
//!           OS interface; its osCpInterface (if any) is consulted for HM
//!           patch registration. Must not be null.
//! \param    [in] params
//!           Patch parameters copied into the new entry. Must not be null.
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS on success; MOS_STATUS_UNKNOWN when the patch
//!           location list is already full. A RegisterPatchForHM failure is
//!           only logged and does not fail the call.
//!
MOS_STATUS GpuContextSpecific::SetPatchEntry(
    PMOS_INTERFACE          osInterface,
    PMOS_PATCH_ENTRY_PARAMS params)
{
    MOS_OS_FUNCTION_ENTER;

    MOS_OS_CHK_NULL_RETURN(m_patchLocationList);
    MOS_OS_CHK_NULL_RETURN(osInterface);
    MOS_OS_CHK_NULL_RETURN(params);

    // m_maxPatchLocationsize is the number of entries the list can hold;
    // writing beyond it would overrun the heap allocation.
    if (m_currentNumPatchLocations >= m_maxPatchLocationsize)
    {
        MOS_OS_ASSERTMESSAGE("Reached max # patch locations.");
        return MOS_STATUS_UNKNOWN;
    }

    PATCHLOCATIONLIST &entry = m_patchLocationList[m_currentNumPatchLocations];

    entry.AllocationIndex  = params->uiAllocationIndex;
    entry.AllocationOffset = params->uiResourceOffset;
    entry.PatchOffset      = params->uiPatchOffset;
    entry.uiWriteOperation = params->bWrite ? true: false;
    // A null cmdBo means no specific command buffer was supplied with the patch.
    entry.cmdBo            =
                params->cmdBuffer != nullptr ? params->cmdBuffer->OsResource.bo : nullptr;

    if (osInterface->osCpInterface &&
        osInterface->osCpInterface->IsHMEnabled())
    {
        if (MOS_STATUS_SUCCESS != osInterface->osCpInterface->RegisterPatchForHM(
            (uint32_t *)(params->cmdBufBase + params->uiPatchOffset),
            params->bWrite,
            params->HwCommandType,
            params->forceDwordOffset,
            params->presResource,
            &entry))
        {
            MOS_OS_ASSERTMESSAGE("Failed to RegisterPatchForHM.");
        }
    }

    m_currentNumPatchLocations++;

    return MOS_STATUS_SUCCESS;
}

//!
//! \brief    Hand out the current command buffer, allocating a fresh one from
//!           the command buffer manager when needed.
//! \details  flags == 0 selects the primary command buffer, any other value is
//!           the index of a secondary one. A new buffer is needed for the
//!           primary when the previous one was flushed, and for a secondary
//!           when none is cached under that index. Otherwise the cached copy
//!           is returned. The buffer's resource is registered in either case.
//! \param    [out] comamndBuffer
//!           Receives the command buffer description. Must not be null.
//! \param    [in] flags
//!           0 for the primary buffer, otherwise the secondary buffer index.
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS on success, an error status otherwise.
//!
MOS_STATUS GpuContextSpecific::GetCommandBuffer(
    PMOS_COMMAND_BUFFER comamndBuffer,
    uint32_t            flags)
{
    MOS_OS_FUNCTION_ENTER;

    MOS_OS_CHK_NULL_RETURN(comamndBuffer);
    MOS_OS_CHK_NULL_RETURN(m_cmdBufMgr);
    MOS_OS_CHK_NULL_RETURN(m_commandBuffer);

    const uint32_t secondaryIdx = flags;
    const bool     primary      = (secondaryIdx == 0);
    const bool     allocateNew  = primary ? m_cmdBufFlushed
                                          : (m_secondaryCmdBufs.count(secondaryIdx) == 0);

    if (!allocateNew)
    {
        // The buffer is still active: return the copy kept by the GPU context.
        PMOS_COMMAND_BUFFER cached = primary ? m_commandBuffer : m_secondaryCmdBufs[secondaryIdx];
        MOS_SecureMemcpy(comamndBuffer, sizeof(MOS_COMMAND_BUFFER), cached, sizeof(MOS_COMMAND_BUFFER));
    }
    else
    {
        MOS_STATUS     bindStatus = MOS_STATUS_SUCCESS;
        CommandBuffer *freshBuf   = nullptr;

        MosUtilities::MosLockMutex(m_cmdBufPoolMutex);

        // Either the pool can still grow, or it is exactly full and the slot
        // at m_nextFetchIndex gets recycled; anything else is an error.
        const bool poolHasRoom = (m_cmdBufPool.size() < MAX_CMD_BUF_NUM);
        const bool recycleSlot = !poolHasRoom &&
                                 (m_cmdBufPool.size() == MAX_CMD_BUF_NUM) &&
                                 (m_nextFetchIndex < m_cmdBufPool.size());
        if (!poolHasRoom && !recycleSlot)
        {
            MOS_OS_ASSERTMESSAGE("Command buffer bool size exceed max.");
            MosUtilities::MosUnlockMutex(m_cmdBufPoolMutex);
            return MOS_STATUS_UNKNOWN;
        }

        if (recycleSlot)
        {
            // Wait for the oldest buffer, detach it and give it back to the manager.
            auto staleBuf      = m_cmdBufPool[m_nextFetchIndex];
            auto staleSpecific = static_cast<CommandBufferSpecific *>(staleBuf);
            if (staleSpecific == nullptr)
            {
                MOS_OS_ASSERTMESSAGE("Invalid (nullptr) Pointer.");
                MosUtilities::MosUnlockMutex(m_cmdBufPoolMutex);
                return MOS_STATUS_NULL_POINTER;
            }
            staleSpecific->waitReady();
            staleSpecific->UnBindToGpuContext();
            m_cmdBufMgr->ReleaseCmdBuf(staleBuf);
        }

        // Pick up a new command buffer and bind it to this GPU context.
        freshBuf = m_cmdBufMgr->PickupOneCmdBuf(m_commandBufferSize);
        if (freshBuf == nullptr)
        {
            MOS_OS_ASSERTMESSAGE("Invalid (nullptr) Pointer.");
            MosUtilities::MosUnlockMutex(m_cmdBufPoolMutex);
            return MOS_STATUS_NULL_POINTER;
        }
        bindStatus = freshBuf->BindToGpuContext(this);
        if (bindStatus != MOS_STATUS_SUCCESS)
        {
            MOS_OS_ASSERTMESSAGE("Invalid status of BindToGpuContext.");
            MosUtilities::MosUnlockMutex(m_cmdBufPoolMutex);
            return bindStatus;
        }

        if (poolHasRoom)
        {
            m_cmdBufPool.push_back(freshBuf);
        }
        else
        {
            m_cmdBufPool[m_nextFetchIndex] = freshBuf;
        }
        MosUtilities::MosUnlockMutex(m_cmdBufPoolMutex);

        // Describe the new buffer in the caller's structure.
        MOS_OS_CHK_STATUS_RETURN(freshBuf->GetResource()->ConvertToMosResource(&comamndBuffer->OsResource));
        comamndBuffer->pCmdBase        = (uint32_t *)freshBuf->GetLockAddr();
        comamndBuffer->pCmdPtr         = (uint32_t *)freshBuf->GetLockAddr();
        comamndBuffer->iOffset         = 0;
        comamndBuffer->iRemaining      = freshBuf->GetCmdBufSize();
        comamndBuffer->iCmdIndex       = m_nextFetchIndex;
        comamndBuffer->iVdboxNodeIndex = MOS_VDBOX_NODE_INVALID;
        comamndBuffer->iVeboxNodeIndex = MOS_VEBOX_NODE_INVALID;
        comamndBuffer->is1stLvlBB      = true;
        comamndBuffer->Attributes.pAttriVe = nullptr;

        // Start from an all-zero command buffer and attribute block.
        MOS_ZeroMemory(comamndBuffer->pCmdBase, comamndBuffer->iRemaining);
        comamndBuffer->iSubmissionType = SUBMISSION_TYPE_SINGLE_PIPE;
        MOS_ZeroMemory(&comamndBuffer->Attributes,sizeof(comamndBuffer->Attributes));

        if (primary)
        {
            // The primary buffer is live again; keep a copy in the GPU context.
            m_cmdBufFlushed = false;
            MOS_SecureMemcpy(m_commandBuffer, sizeof(MOS_COMMAND_BUFFER), comamndBuffer, sizeof(MOS_COMMAND_BUFFER));
        }
        else
        {
            // Secondary buffers are cached per index in m_secondaryCmdBufs.
            PMOS_COMMAND_BUFFER secondaryCopy = (PMOS_COMMAND_BUFFER)MOS_AllocAndZeroMemory(sizeof(MOS_COMMAND_BUFFER));
            MOS_OS_CHK_NULL_RETURN(secondaryCopy);
            m_secondaryCmdBufs[secondaryIdx] = secondaryCopy;
            MOS_SecureMemcpy(secondaryCopy, sizeof(MOS_COMMAND_BUFFER), comamndBuffer, sizeof(MOS_COMMAND_BUFFER));
        }

        // Command buffers are treated as cyclical buffers, the CB after the just submitted one
        // has the minimal fence value that we should wait
        m_nextFetchIndex++;
        if (m_nextFetchIndex >= MAX_CMD_BUF_NUM)
        {
            m_nextFetchIndex = 0;
        }
    }

    // Make sure the command buffer's own resource is in the allocation list.
    if (primary)
    {
        MOS_OS_CHK_STATUS_RETURN(RegisterResource(&m_commandBuffer->OsResource, false));
    }
    else
    {
        MOS_OS_CHK_STATUS_RETURN(RegisterResource(&m_secondaryCmdBufs[secondaryIdx]->OsResource, false));
    }

    return MOS_STATUS_SUCCESS;
}

//!
//! \brief    Store the caller's progress on a command buffer back into the
//!           copy kept by the GPU context.
//! \details  For the primary buffer (flags == 0) only the offset, remaining
//!           size, write pointer and VDBOX/VEBOX node indices are copied; for
//!           a secondary buffer the whole structure is copied. Invalid input
//!           (null buffer, unknown secondary index) is logged and ignored.
//! \param    [in] cmdBuffer
//!           Command buffer being returned.
//! \param    [in] flags
//!           0 for the primary buffer, otherwise the secondary buffer index.
//!
void GpuContextSpecific::ReturnCommandBuffer(
    PMOS_COMMAND_BUFFER cmdBuffer,
    uint32_t            flags)
{
    MOS_OS_FUNCTION_ENTER;

    // Assertions may compile to nothing, so null is checked explicitly.
    if (cmdBuffer == nullptr)
    {
        MOS_OS_ASSERTMESSAGE("Invalid (nullptr) Pointer.");
        return;
    }

    bool isPrimaryCmdBuf = (flags == 0);

    if (isPrimaryCmdBuf)
    {
        if (m_commandBuffer == nullptr)
        {
            MOS_OS_ASSERTMESSAGE("Invalid (nullptr) Pointer.");
            return;
        }

        m_commandBuffer->iOffset    = cmdBuffer->iOffset;
        m_commandBuffer->iRemaining = cmdBuffer->iRemaining;
        m_commandBuffer->pCmdPtr    = cmdBuffer->pCmdPtr;
        m_commandBuffer->iVdboxNodeIndex = cmdBuffer->iVdboxNodeIndex;
        m_commandBuffer->iVeboxNodeIndex = cmdBuffer->iVeboxNodeIndex;
    }
    else
    {
        // Look the entry up without operator[], which would insert a null
        // entry for an unknown index and then copy into it.
        uint32_t secondaryIdx = flags;
        auto     secondary    = m_secondaryCmdBufs.find(secondaryIdx);
        if (secondary == m_secondaryCmdBufs.end() || secondary->second == nullptr)
        {
            MOS_OS_ASSERTMESSAGE("Invalid (nullptr) Pointer.");
            return;
        }

        MOS_SecureMemcpy(secondary->second, sizeof(MOS_COMMAND_BUFFER), cmdBuffer, sizeof(MOS_COMMAND_BUFFER));
    }
}

//!
//! \brief    Mark the primary command buffer as flushed and drop all cached
//!           secondary command buffer copies.
//! \return   MOS_STATUS
//!           Always MOS_STATUS_SUCCESS.
//!
MOS_STATUS GpuContextSpecific::ResetCommandBuffer()
{
    m_cmdBufFlushed = true;

    // Free the heap copy behind every secondary index, then forget the indices.
    for (auto &secondary : m_secondaryCmdBufs)
    {
        MOS_FreeMemory(secondary.second);
    }
    m_secondaryCmdBufs.clear();

    return MOS_STATUS_SUCCESS;
}

//!
//! \brief    Set the size of the indirect state heap.
//! \param    [in] size
//!           Requested size; must be strictly smaller than the current
//!           command buffer size.
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS when accepted, MOS_STATUS_UNKNOWN otherwise.
//!
MOS_STATUS GpuContextSpecific::SetIndirectStateSize(const uint32_t size)
{
    // The indirect heap must fit inside the command buffer.
    if (size >= m_commandBufferSize)
    {
        MOS_OS_ASSERTMESSAGE("Indirect State Size if out of boundry!");
        return MOS_STATUS_UNKNOWN;
    }

    m_IndirectHeapSize = size;
    return MOS_STATUS_SUCCESS;
}

//!
//! \brief    Report where the indirect state heap sits in the command buffer.
//! \param    [out] offset
//!           Optional; receives command buffer size minus indirect heap size.
//! \param    [out] size
//!           Optional; receives the indirect heap size.
//! \return   MOS_STATUS
//!           Always MOS_STATUS_SUCCESS.
//!
MOS_STATUS GpuContextSpecific::GetIndirectState(
    uint32_t *offset,
    uint32_t *size)
{
    MOS_OS_FUNCTION_ENTER;

    // Either output may be omitted by passing nullptr.
    if (offset != nullptr)
    {
        *offset = m_commandBufferSize - m_IndirectHeapSize;
    }
    if (size != nullptr)
    {
        *size = m_IndirectHeapSize;
    }

    return MOS_STATUS_SUCCESS;
}

//!
//! \brief    Get the address of the indirect state heap inside the current
//!           primary command buffer.
//! \param    [out] indirectState
//!           Receives command buffer base + command buffer size - indirect
//!           heap size. Must not be null.
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS on success, a null pointer error status when
//!           indirectState is null or the context has no command buffer.
//!
MOS_STATUS GpuContextSpecific::GetIndirectStatePointer(
    uint8_t **indirectState)
{
    MOS_OS_FUNCTION_ENTER;

    MOS_OS_CHK_NULL_RETURN(indirectState);
    // m_commandBuffer only exists after a successful Init().
    MOS_OS_CHK_NULL_RETURN(m_commandBuffer);

    *indirectState = (uint8_t *)m_commandBuffer->pCmdBase + m_commandBufferSize - m_IndirectHeapSize;

    return MOS_STATUS_SUCCESS;
}

//!
//! \brief    Set the size used for subsequently allocated command buffers and
//!           grow the patch location list when more entries are requested.
//! \details  The patch location list never shrinks. It is grown with the same
//!           MOS allocation helpers that allocate and free it elsewhere in
//!           this class, instead of a raw realloc on that memory.
//! \param    [in] requestedCommandBufferSize
//!           Requested command buffer size in bytes; rounded up to 8 bytes.
//! \param    [in] requestedPatchListSize
//!           Requested number of patch location entries.
//! \param    [in] flags
//!           Not used by this implementation.
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS on success, a null pointer error status when
//!           the larger patch list cannot be allocated (the old list is kept).
//!
MOS_STATUS GpuContextSpecific::ResizeCommandBufferAndPatchList(
    uint32_t requestedCommandBufferSize,
    uint32_t requestedPatchListSize,
    uint32_t flags)
{
    MOS_OS_FUNCTION_ENTER;

    // m_commandBufferSize is used for allocate command buffer and submit command buffer, in this moment, command buffer has not allocated yet.
    // Linux KMD requires command buffer size align to 8 bytes, or it will not execute the commands.
    m_commandBufferSize = MOS_ALIGN_CEIL(requestedCommandBufferSize, 8);

    if (requestedPatchListSize > m_maxPatchLocationsize)
    {
        // The new list comes back zero-filled, so the extended portion needs
        // no separate clearing.
        PPATCHLOCATIONLIST newPatchList = (PPATCHLOCATIONLIST)MOS_AllocAndZeroMemory(sizeof(PATCHLOCATIONLIST) * requestedPatchListSize);
        MOS_OS_CHK_NULL_RETURN(newPatchList);

        if (m_patchLocationList != nullptr)
        {
            // Carry over the existing entries, then release the old list.
            MOS_SecureMemcpy(newPatchList,
                             sizeof(PATCHLOCATIONLIST) * requestedPatchListSize,
                             m_patchLocationList,
                             sizeof(PATCHLOCATIONLIST) * m_maxPatchLocationsize);
            MOS_SafeFreeMemory(m_patchLocationList);
        }

        m_patchLocationList    = newPatchList;
        m_maxPatchLocationsize = requestedPatchListSize;
    }

    return MOS_STATUS_SUCCESS;
}

//!
//! \brief    Set the size used for subsequently allocated command buffers.
//! \param    [in] requestedSize
//!           Requested command buffer size in bytes; rounded up to 8 bytes,
//!           matching ResizeCommandBufferAndPatchList().
//! \return   MOS_STATUS
//!           Always MOS_STATUS_SUCCESS.
//!
MOS_STATUS GpuContextSpecific::ResizeCommandBuffer(uint32_t requestedSize)
{
    MOS_OS_FUNCTION_ENTER;

    // Linux KMD requires command buffer size align to 8 bytes, or it will not execute the commands.
    m_commandBufferSize = MOS_ALIGN_CEIL(requestedSize, 8);

    return MOS_STATUS_SUCCESS;
}

//!
//! \brief    Compute the i915 exec flag selecting the BSD ring for a command
//!           buffer, assigning a VDBOX node to the buffer first if it has none.
//! \param    [in] osInterface
//!           OS interface; its pfnGetVdboxNodeId callback is asked for a node
//!           when the command buffer has none.
//! \param    [in,out] cmdBuffer
//!           Command buffer; iVdboxNodeIndex is filled in when it was invalid.
//! \param    [in] gpuNode
//!           GPU node used as a fallback: MOS_GPU_NODE_VIDEO maps to VDBOX
//!           node 1, any other node to VDBOX node 2.
//! \return   uint32_t
//!           I915_EXEC_BSD combined with ring 2 for VDBOX node 2 and with
//!           ring 1 otherwise; I915_EXEC_DEFAULT when an argument is null.
//!
uint32_t GetVcsExecFlag(PMOS_INTERFACE osInterface,
                            PMOS_COMMAND_BUFFER cmdBuffer,
                            MOS_GPU_NODE gpuNode)
{
    if (!osInterface || !cmdBuffer)
    {
        MOS_OS_ASSERTMESSAGE("Input invalid(null) parameter.");
        return I915_EXEC_DEFAULT;
    }

    if (cmdBuffer->iVdboxNodeIndex == MOS_VDBOX_NODE_INVALID)
    {
        // The batch buffer carried no VDBOX specific commands, so a VDBOX has
        // to be chosen here: ask the OS interface first, then fall back to a
        // choice derived from the GPU node.
        cmdBuffer->iVdboxNodeIndex = osInterface->pfnGetVdboxNodeId(osInterface, cmdBuffer);
        if (cmdBuffer->iVdboxNodeIndex == MOS_VDBOX_NODE_INVALID)
        {
            cmdBuffer->iVdboxNodeIndex = (gpuNode == MOS_GPU_NODE_VIDEO)?
                MOS_VDBOX_NODE_1: MOS_VDBOX_NODE_2;
        }
    }

    // Ring 1 is the default; only VDBOX node 2 is routed to ring 2.
    const bool useRing2 = (cmdBuffer->iVdboxNodeIndex == MOS_VDBOX_NODE_2);
    const uint32_t vcsExecFlag = useRing2 ? (I915_EXEC_BSD | I915_EXEC_BSD_RING2)
                                          : (I915_EXEC_BSD | I915_EXEC_BSD_RING1);

    return vcsExecFlag;
}

//!
//! \brief    Map every registered allocation through the aux table manager and
//!           emit the aux table BO list for a command buffer object.
//! \details  Does nothing when the OS context has no aux table manager.
//! \param    [in] cmd_bo
//!           Command buffer object passed to EmitAuxTableBOList. Must not be null.
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS on success, the first failing status otherwise.
//!
MOS_STATUS GpuContextSpecific::MapResourcesToAuxTable(mos_linux_bo *cmd_bo)
{
    MOS_OS_CHK_NULL_RETURN(cmd_bo);

    auto *specificCtx = static_cast<OsContextSpecific*>(m_osContext);
    MOS_OS_CHK_NULL_RETURN(specificCtx);

    AuxTableMgr *tableMgr = specificCtx->GetAuxTableMgr();
    if (tableMgr == nullptr)
    {
        // No aux table manager: nothing to map.
        return MOS_STATUS_SUCCESS;
    }

    // Hand each entry of the allocation list to the aux table manager.
    for (uint32_t idx = 0; idx < m_numAllocations; ++idx)
    {
        PMOS_RESOURCE resource = (PMOS_RESOURCE)m_allocationList[idx].hAllocation;
        MOS_OS_CHK_NULL_RETURN(resource);
        MOS_OS_CHK_STATUS_RETURN(tableMgr->MapResource(resource->pGmmResInfo, resource->bo));
    }
    MOS_OS_CHK_STATUS_RETURN(tableMgr->EmitAuxTableBOList(cmd_bo));

    return MOS_STATUS_SUCCESS;
}

//!
//! \brief    Patch, terminate and submit the pending command buffer(s)
//! \details  The function performs the following steps in order:
//!           1. Calls MapResourcesToAuxTable for the primary and every secondary
//!              command buffer BO (the returned status is not checked).
//!           2. Walks the patch location list, writes the target address into
//!              the command buffer and registers the target either as a softpin
//!              target or as a relocation.
//!           3. Appends MI_BATCHBUFFER_END and unlocks the command buffer(s).
//!           4. Picks the exec flag and submits through mos_gem_bo_context_exec2,
//!              SubmitPipeCommands or ParallelSubmitCommands.
//!           5. Optionally dumps the command buffer(s), clears the relocations,
//!              frees the secondary command buffers and resets the allocation,
//!              patch location and write mode lists.
//! \param    [in] osInterface
//!           OS interface; supplies the OS context, the CP interface and the
//!           scheduling/debug switches read below
//! \param    [in,out] cmdBuffer
//!           Primary command buffer; its iSubmissionType is forced to
//!           SUBMISSION_TYPE_MULTI_PIPE_MASTER when two or more secondary
//!           command buffers are pending
//! \param    [in] nullRendering
//!           When true the real submission is skipped; debug/internal builds
//!           execute a NOP command buffer instead
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS on success, MOS_STATUS_UNKNOWN when patching,
//!           BB_END insertion or the kernel submission fails, or the status of
//!           a failing null/status check
//!
MOS_STATUS GpuContextSpecific::SubmitCommandBuffer(
    PMOS_INTERFACE      osInterface,
    PMOS_COMMAND_BUFFER cmdBuffer,
    bool                nullRendering)
{
    MOS_OS_FUNCTION_ENTER;

    MOS_TraceEventExt(EVENT_MOS_BATCH_SUBMIT, EVENT_TYPE_START, nullptr, 0, nullptr, 0);

    MOS_OS_CHK_NULL_RETURN(osInterface);
    PMOS_CONTEXT osContext = osInterface->pOsContext;
    MOS_OS_CHK_NULL_RETURN(osContext);
    MOS_OS_CHK_NULL_RETURN(cmdBuffer);
    MOS_OS_CHK_NULL_RETURN(m_patchLocationList);

    MOS_GPU_NODE gpuNode  = OSKMGetGpuNode(m_gpuContext);
    uint32_t     execFlag = gpuNode;
    MOS_STATUS   eStatus  = MOS_STATUS_SUCCESS;
    int32_t      ret      = 0;
    bool         scalaEnabled = false;
    auto         it           = m_secondaryCmdBufs.begin();

    // Command buffer object DRM pointer
    m_cmdBufFlushed = true;
    auto cmd_bo     = cmdBuffer->OsResource.bo;

    // Map Resource to Aux if needed.
    // NOTE(review): the returned status is intentionally left unchecked here, as before.
    MapResourcesToAuxTable(cmd_bo);
    for (const auto &secondary : m_secondaryCmdBufs)
    {
        MapResourcesToAuxTable(secondary.second->OsResource.bo);
    }

    if (m_secondaryCmdBufs.size() >= 2)
    {
        scalaEnabled = true;
        cmdBuffer->iSubmissionType = SUBMISSION_TYPE_MULTI_PIPE_MASTER;
    }

    std::vector<PMOS_RESOURCE> mappedResList;
    std::vector<MOS_LINUX_BO *> skipSyncBoList;

    // Unlocks every nested batch buffer locked for patching. It is also called on the
    // error paths of the patch loop so that a failed submission leaves nothing locked.
    auto unlockMappedResources = [this, &mappedResList]()
    {
        for (auto res : mappedResList)
        {
            res->pGfxResource->Unlock(m_osContext);
        }
        mappedResList.clear();
    };

    // Now, the patching will be done, based on the patch list.
    for (uint32_t patchIndex = 0; patchIndex < m_currentNumPatchLocations; patchIndex++)
    {
        auto currentPatch = &m_patchLocationList[patchIndex];
        MOS_OS_CHK_NULL_RETURN(currentPatch);

        auto tempCmdBo = currentPatch->cmdBo == nullptr ? cmd_bo : currentPatch->cmdBo;

        // Following are for Nested BB buffer, if it's nested BB, we need to ensure it's locked.
        if (tempCmdBo != cmd_bo)
        {
            bool isSecondaryCmdBuf = false;
            it = m_secondaryCmdBufs.begin();
            while(it != m_secondaryCmdBufs.end())
            {
                if (it->second->OsResource.bo == tempCmdBo)
                {
                    isSecondaryCmdBuf = true;
                    break;
                }
                it++;
            }

            for (uint32_t allocIdx = 0; allocIdx < m_numAllocations && (!isSecondaryCmdBuf); allocIdx++)
            {
                auto tempRes = (PMOS_RESOURCE)m_allocationList[allocIdx].hAllocation;
                if (tempRes == nullptr || tempRes->pGfxResource == nullptr)
                {
                    // Entry cannot be locked; keep looking instead of dereferencing null.
                    continue;
                }
                if (tempCmdBo == tempRes->bo)
                {
                    GraphicsResource::LockParams param;
                    param.m_writeRequest = true;
                    tempRes->pGfxResource->Lock(m_osContext, param);
                    mappedResList.push_back(tempRes);
                    break;
                }
            }
        }

        // This is the resource for which patching will be done
        auto resource = (PMOS_RESOURCE)m_allocationList[currentPatch->AllocationIndex].hAllocation;
        if (resource == nullptr)
        {
            MOS_OS_ASSERTMESSAGE("Invalid (null) resource referenced by the patch list.");
            unlockMappedResources();
            return MOS_STATUS_NULL_POINTER;
        }

        // For now, we'll assume the system memory's DRM bo pointer
        // is NULL.  If nullptr is detected, then the resource has been
        // placed inside the command buffer's indirect state area.
        // We'll simply set alloc_bo to the command buffer's bo pointer.
        MOS_OS_ASSERT(resource->bo);

        auto alloc_bo = (resource->bo) ? resource->bo : tempCmdBo;

        const MOS_STATUS permeateStatus = osInterface->osCpInterface->PermeatePatchForHM(
            tempCmdBo->virt,
            currentPatch,
            resource);
        if (permeateStatus != MOS_STATUS_SUCCESS)
        {
            MOS_OS_ASSERTMESSAGE("PermeatePatchForHM failed.");
            unlockMappedResources();
            return permeateStatus;
        }

        // Prefer the per-context offset recorded for this BO, if one exists.
        uint64_t boOffset = alloc_bo->offset64;
        if (alloc_bo != tempCmdBo)
        {
            auto item_ctx = osContext->contextOffsetList.begin();
            for (; item_ctx != osContext->contextOffsetList.end(); item_ctx++)
            {
                if (item_ctx->intel_context == osContext->intel_context && item_ctx->target_bo == alloc_bo)
                {
                    boOffset = item_ctx->offset64;
                    break;
                }
            }
        }

        // Write the presumed address into the command buffer (64 or 32 bit wide).
        if (osContext->bUse64BitRelocs)
        {
            *((uint64_t *)((uint8_t *)tempCmdBo->virt + currentPatch->PatchOffset)) =
                    boOffset + currentPatch->AllocationOffset;
        }
        else
        {
            *((uint32_t *)((uint8_t *)tempCmdBo->virt + currentPatch->PatchOffset)) =
                    boOffset + currentPatch->AllocationOffset;
        }

        // Collect the BOs referenced by slave pipes that are not yet flagged async;
        // SubmitPipeCommands marks them async on the slave command buffer BO.
        if (scalaEnabled)
        {
            it = m_secondaryCmdBufs.begin();
            while(it != m_secondaryCmdBufs.end())
            {
                if (it->second->OsResource.bo == tempCmdBo &&
                    it->second->iSubmissionType & SUBMISSION_TYPE_MULTI_PIPE_SLAVE &&
                    !mos_gem_bo_is_exec_object_async(alloc_bo))
                {
                    skipSyncBoList.push_back(alloc_bo);
                    break;
                }
                it++;
            }
        }
        else if (cmdBuffer->iSubmissionType & SUBMISSION_TYPE_MULTI_PIPE_SLAVE &&
                !mos_gem_bo_is_exec_object_async(alloc_bo))
        {
            skipSyncBoList.push_back(alloc_bo);
        }

#if (_DEBUG || _RELEASE_INTERNAL)
        {
            uint32_t evtData[] = {alloc_bo->handle, currentPatch->uiWriteOperation, currentPatch->AllocationOffset};
            MOS_TraceEventExt(EVENT_MOS_BATCH_SUBMIT, EVENT_TYPE_INFO,
                              evtData, sizeof(evtData),
                              &boOffset, sizeof(boOffset));
        }
#endif
        if(mos_gem_bo_is_softpin(alloc_bo))
        {
            if (alloc_bo != tempCmdBo)
            {
                ret = mos_bo_add_softpin_target(tempCmdBo, alloc_bo, currentPatch->uiWriteOperation);
            }
        }
        else
        {
            // This call will patch the command buffer with the offsets of the indirect state region of the command buffer
            ret = mos_bo_emit_reloc2(
                tempCmdBo,                                                         // Command buffer
                currentPatch->PatchOffset,                                         // Offset in the command buffer
                alloc_bo,                                                          // Allocation object for which the patch will be made.
                currentPatch->AllocationOffset,                                    // Offset to the indirect state
                I915_GEM_DOMAIN_RENDER,                                            // Read domain
                (currentPatch->uiWriteOperation) ? I915_GEM_DOMAIN_RENDER : 0x0,   // Write domain
                boOffset);
        }

        if (ret != 0)
        {
            // Pointers are printed with %p; the former %x specifier did not match uintptr_t.
            MOS_OS_ASSERTMESSAGE("Error patching alloc_bo = %p, cmd_bo = %p.",
                (void *)alloc_bo,
                (void *)tempCmdBo);
            unlockMappedResources();
            return MOS_STATUS_UNKNOWN;
        }
    }

    // Patching is complete: release the nested batch buffers locked above.
    unlockMappedResources();

    if (scalaEnabled)
    {
         it = m_secondaryCmdBufs.begin();
         while(it != m_secondaryCmdBufs.end())
         {
             //Add Batch buffer End Command
             uint32_t batchBufferEndCmd = MI_BATCHBUFFER_END;
             if (MOS_FAILED(Mos_AddCommand(
                     it->second,
                     &batchBufferEndCmd,
                     sizeof(uint32_t))))
             {
                 MOS_OS_ASSERTMESSAGE("Inserting BB_END failed!");
                 return MOS_STATUS_UNKNOWN;
             }
             it++;
         }
    }
    else
    {
        //Add Batch buffer End Command
        uint32_t batchBufferEndCmd = MI_BATCHBUFFER_END;
        if (MOS_FAILED(Mos_AddCommand(
                cmdBuffer,
                &batchBufferEndCmd,
                sizeof(uint32_t))))
        {
            MOS_OS_ASSERTMESSAGE("Inserting BB_END failed!");
            return MOS_STATUS_UNKNOWN;
        }
    }

    // Now, we can unmap the video command buffer, since we don't need CPU access anymore.
    MOS_OS_CHK_NULL_RETURN(cmdBuffer->OsResource.pGfxResource);
    cmdBuffer->OsResource.pGfxResource->Unlock(m_osContext);

    it = m_secondaryCmdBufs.begin();
    while(it != m_secondaryCmdBufs.end())
    {
        MOS_OS_CHK_NULL_RETURN(it->second->OsResource.pGfxResource);
        it->second->OsResource.pGfxResource->Unlock(m_osContext);

        it++;
    }

    int32_t perfData;
    if (osContext->pPerfData != nullptr)
    {
        perfData = *(int32_t *)(osContext->pPerfData);
    }
    else
    {
        perfData = 0;
    }

    drm_clip_rect_t *cliprects     = nullptr;
    int32_t          num_cliprects = 0;
    int32_t          DR4           = osContext->uEnablePerfTag ? perfData : 0;

    //Since CB2 command is not supported, remove it and set cliprects to nullptr as default.
    if ((gpuNode == MOS_GPU_NODE_VIDEO || gpuNode == MOS_GPU_NODE_VIDEO2) &&
        (cmdBuffer->iSubmissionType & SUBMISSION_TYPE_SINGLE_PIPE_MASK))
    {
        if (osContext->bKMDHasVCS2)
        {
            if (osContext->bPerCmdBufferBalancing && osInterface->pfnGetVdboxNodeId)
            {
                execFlag = GetVcsExecFlag(osInterface, cmdBuffer, gpuNode);
            }
            else if (gpuNode == MOS_GPU_NODE_VIDEO)
            {
                execFlag = I915_EXEC_BSD | I915_EXEC_BSD_RING1;
            }
            else if (gpuNode == MOS_GPU_NODE_VIDEO2)
            {
                execFlag = I915_EXEC_BSD | I915_EXEC_BSD_RING2;
            }
            else if ((gpuNode == MOS_GPU_NODE_BLT))
            {
                execFlag = I915_EXEC_BLT;
            }
            else
            {
                MOS_OS_ASSERTMESSAGE("Invalid gpuNode.");
            }
        }
        else
        {
            execFlag = I915_EXEC_BSD | I915_EXEC_BSD_RING1;
        }
    }

#if (_DEBUG || _RELEASE_INTERNAL)

    MOS_LINUX_BO *bad_cmd_bo = nullptr;
    MOS_LINUX_BO *nop_cmd_bo = nullptr;
    uint32_t      dwComponentTag = 0;
    uint32_t      dwCallType = 0;

    //dwComponentTag 3: decode,5: vpp,6: encode
    //dwCallType     8: PAK(CODECHAL_ENCODE_PERFTAG_CALL_PAK_ENGINE)
    //             34: PREENC
    //             5: VPP
    dwComponentTag = (perfData & 0xF000) >> 12;
    dwCallType     = (perfData & 0xFC) >> 2;

    if (osInterface->bTriggerCodecHang &&
        (dwComponentTag == 3 || (dwComponentTag == 6 && dwCallType == 8) ||
            (dwComponentTag == 6 && dwCallType == 34) ||
            (dwComponentTag == 5 && dwCallType == 5)))
    {
        bad_cmd_bo = Mos_GetBadCommandBuffer_Linux(osInterface);
        if (bad_cmd_bo)
        {
            ret = mos_bo_mrb_exec(bad_cmd_bo,
                4096,
                nullptr,
                0,
                0,
                execFlag);
        }
        else
        {
            MOS_OS_ASSERTMESSAGE("Mos_GetBadCommandBuffer_Linux failed!");
        }
    }
    else if (osInterface->bTriggerVPHang == true)
    {
        bad_cmd_bo = Mos_GetBadCommandBuffer_Linux(osInterface);

        if (bad_cmd_bo)
        {
            ret = mos_bo_mrb_exec(bad_cmd_bo,
                4096,
                nullptr,
                0,
                0,
                execFlag);
        }
        else
        {
            MOS_OS_ASSERTMESSAGE("Mos_GetBadCommandBuffer_Linux failed!");
        }

        osInterface->bTriggerVPHang = false;
    }

    nop_cmd_bo = nullptr;
    if (nullRendering == true)
    {
        nop_cmd_bo = Mos_GetNopCommandBuffer_Linux(osInterface);

        if (nop_cmd_bo)
        {
            ret = mos_bo_mrb_exec(nop_cmd_bo,
                4096,
                nullptr,
                0,
                0,
                execFlag);
        }
        else
        {
            MOS_OS_ASSERTMESSAGE("Mos_GetNopCommandBuffer_Linux failed!");
        }
    }

#endif  //(_DEBUG || _RELEASE_INTERNAL)

    if (gpuNode != I915_EXEC_RENDER &&
        osInterface->osCpInterface->IsTearDownHappen())
    {
        // skip PAK command when CP tear down happen to avoid of GPU hang
        // conditonal batch buffer start PoC is in progress
    }
    else if (nullRendering == false)
    {
        if (osInterface->ctxBasedScheduling && m_i915Context[0] != nullptr)
        {
            if (cmdBuffer->iSubmissionType & SUBMISSION_TYPE_MULTI_PIPE_MASK)
            {
                if (scalaEnabled && !osInterface->bGucSubmission)
                {
                    uint32_t secondaryIndex = 0;
                    it = m_secondaryCmdBufs.begin();
                    while(it != m_secondaryCmdBufs.end())
                    {
                        if (it->second->iSubmissionType & SUBMISSION_TYPE_MULTI_PIPE_SLAVE)
                        {
                            if(execFlag == MOS_GPU_NODE_VE)
                            {
                                // decode excluded since init in other place
                                it->second->iSubmissionType |= (secondaryIndex << SUBMISSION_TYPE_MULTI_PIPE_SLAVE_INDEX_SHIFT);
                                secondaryIndex++;
                            }
                        }

                        ret = SubmitPipeCommands(it->second,
                                                 it->second->OsResource.bo,
                                                 osContext,
                                                 skipSyncBoList,
                                                 execFlag,
                                                 DR4);
                        it++;
                    }
                }
                else if(scalaEnabled && osInterface->bGucSubmission)
                {
                    ret = ParallelSubmitCommands(m_secondaryCmdBufs,
                                         osContext,
                                         execFlag,
                                         DR4);
                }
                else
                {
                    ret = SubmitPipeCommands(cmdBuffer,
                                             cmd_bo,
                                             osContext,
                                             skipSyncBoList,
                                             execFlag,
                                             DR4);
                }
            }
            else
            {
                ret = mos_gem_bo_context_exec2(cmd_bo,
                    m_commandBufferSize,
                    m_i915Context[0],
                    cliprects,
                    num_cliprects,
                    DR4,
                    m_i915ExecFlag,
                    nullptr);
            }
        }
        else
        {
            ret = mos_gem_bo_context_exec2(cmd_bo,
                m_commandBufferSize,
                osContext->intel_context,
                cliprects,
                num_cliprects,
                DR4,
                execFlag,
                nullptr);
        }
        if (ret != 0)
        {
            eStatus = MOS_STATUS_UNKNOWN;
        }
    }

    if (eStatus != MOS_STATUS_SUCCESS)
    {
        MOS_OS_ASSERTMESSAGE("Command buffer submission failed!");
    }

    MOS_DEVULT_FuncCall(pfnUltGetCmdBuf, cmdBuffer);

#if MOS_COMMAND_BUFFER_DUMP_SUPPORTED
    // The dump is serialized across threads with a file-level mutex.
    pthread_mutex_lock(&command_dump_mutex);
    if (osInterface->bDumpCommandBuffer)
    {
        if (scalaEnabled)
        {
            it = m_secondaryCmdBufs.begin();
            while(it != m_secondaryCmdBufs.end())
            {
                mos_bo_map(it->second->OsResource.bo, 0);
                osInterface->pfnDumpCommandBuffer(osInterface, it->second);
                mos_bo_unmap(it->second->OsResource.bo);
                it++;
            }
        }
        else
        {
            mos_bo_map(cmd_bo, 0);
            osInterface->pfnDumpCommandBuffer(osInterface, cmdBuffer);
            mos_bo_unmap(cmd_bo);
        }
    }
    pthread_mutex_unlock(&command_dump_mutex);
#endif  // MOS_COMMAND_BUFFER_DUMP_SUPPORTED

#if (_DEBUG || _RELEASE_INTERNAL)
    if (bad_cmd_bo)
    {
        mos_bo_wait_rendering(bad_cmd_bo);
        mos_bo_unreference(bad_cmd_bo);
    }
    if (nop_cmd_bo)
    {
        mos_bo_unreference(nop_cmd_bo);
    }
#endif  //(_DEBUG || _RELEASE_INTERNAL)

    //clear command buffer relocations to fix memory leak issue
    mos_gem_bo_clear_relocs(cmd_bo, 0);
    it = m_secondaryCmdBufs.begin();
    while(it != m_secondaryCmdBufs.end())
    {
        mos_gem_bo_clear_relocs(it->second->OsResource.bo, 0);
        MOS_FreeMemory(it->second);
        it++;
    }
    m_secondaryCmdBufs.clear();
    skipSyncBoList.clear();

    // Reset resource allocation
    m_numAllocations = 0;
    MOS_ZeroMemory(m_allocationList, sizeof(ALLOCATION_LIST) * m_maxNumAllocations);
    m_currentNumPatchLocations = 0;
    // The patch list has its own capacity; sizing this with the allocation list
    // capacity could write past the end of the patch list.
    MOS_ZeroMemory(m_patchLocationList, sizeof(PATCHLOCATIONLIST) * m_maxPatchLocationsize);
    m_resCount = 0;

    MOS_ZeroMemory(m_writeModeList, sizeof(bool) * m_maxNumAllocations);
finish:
    MOS_TraceEventExt(EVENT_MOS_BATCH_SUBMIT, EVENT_TYPE_END, &eStatus, sizeof(eStatus), nullptr, 0);
    return eStatus;
}

//!
//! \brief    Submit one command buffer of a multi-pipe workload
//! \details  Selects the i915 context (queue) and the fence flag from the
//!           submission type of the command buffer:
//!           - slave pipes wait on osContext->submit_fence (I915_EXEC_FENCE_SUBMIT)
//!             and use m_i915Context[2 + slaveIndex], or
//!             m_i915Context[iVeboxNodeIndex + 1] for a VE submission;
//!           - alone/master pipes use m_i915Context[1]; the master additionally
//!             requests an out fence that is stored in osContext->submit_fence;
//!           - the pipe flagged as last closes the fence.
//! \param    [in] cmdBuffer
//!           Command buffer whose iSubmissionType drives the selection
//! \param    [in] cmdBo
//!           BO that is executed
//! \param    [in,out] osContext
//!           OS context holding the submit fence shared between the pipes
//! \param    [in] skipSyncBoList
//!           BOs marked as async exec objects on cmdBo for slave pipes
//! \param    [in] execFlag
//!           GPU node / exec flag; VIDEO, VIDEO2 and VE nodes are replaced by
//!           I915_EXEC_DEFAULT
//! \param    [in] dr4
//!           Value forwarded as DR4 to mos_gem_bo_context_exec2
//! \return   int32_t
//!           Result of mos_gem_bo_context_exec2, or -1 on invalid input or an
//!           invalid slave index
//!
int32_t GpuContextSpecific::SubmitPipeCommands(
    MOS_COMMAND_BUFFER *cmdBuffer,
    MOS_LINUX_BO *cmdBo,
    PMOS_CONTEXT osContext,
    const std::vector<MOS_LINUX_BO *> &skipSyncBoList,
    uint32_t execFlag,
    int32_t dr4)
{
    if (cmdBuffer == nullptr || cmdBo == nullptr || osContext == nullptr)
    {
        MOS_OS_ASSERTMESSAGE("Input invalid(null) parameter.");
        return -1;
    }

    int32_t      ret        = 0;
    int          fence      = -1;
    unsigned int fenceFlag = 0;

    MOS_LINUX_CONTEXT *queue = m_i915Context[0];
    bool isVeboxSubmission   = false;

    if (execFlag == MOS_GPU_NODE_VIDEO || execFlag == MOS_GPU_NODE_VIDEO2)
    {
        execFlag = I915_EXEC_DEFAULT;
    }
    if (execFlag == MOS_GPU_NODE_VE)
    {
        execFlag = I915_EXEC_DEFAULT;
        isVeboxSubmission = true;
    }

    if(cmdBuffer->iSubmissionType & SUBMISSION_TYPE_MULTI_PIPE_SLAVE)
    {
        fence = osContext->submit_fence;
        fenceFlag = I915_EXEC_FENCE_SUBMIT;
        int slaveIndex = (cmdBuffer->iSubmissionType & SUBMISSION_TYPE_MULTI_PIPE_SLAVE_INDEX_MASK) >> SUBMISSION_TYPE_MULTI_PIPE_SLAVE_INDEX_SHIFT;
        if(slaveIndex < 7)
        {
            queue = m_i915Context[2 + slaveIndex]; //0 is for single pipe, 1 is for master, slave starts from 2
        }
        else
        {
            // slaveIndex is an integer: it must be printed with %d, not %s.
            MOS_OS_ASSERTMESSAGE("slaveIndex value: %d is invalid!", slaveIndex);
            return -1;
        }

        if (isVeboxSubmission)
        {
            queue = m_i915Context[cmdBuffer->iVeboxNodeIndex + 1];
        }

        for(auto bo: skipSyncBoList)
        {
            mos_bo_set_exec_object_async(cmdBo, bo);
        }
    }

    //Keep FE and BE0 running on same engine for VT decode
    if((cmdBuffer->iSubmissionType & SUBMISSION_TYPE_MULTI_PIPE_ALONE)
        || (cmdBuffer->iSubmissionType & SUBMISSION_TYPE_MULTI_PIPE_MASTER))
    {
        if(cmdBuffer->iSubmissionType & SUBMISSION_TYPE_MULTI_PIPE_MASTER)
        {
            //Only master pipe needs fence out flag
            fenceFlag = I915_EXEC_FENCE_OUT;
        }
        queue = m_i915Context[1];
    }

    ret = mos_gem_bo_context_exec2(cmdBo,
                                  cmdBo->size,
                                  queue,
                                  nullptr,
                                  0,
                                  dr4,
                                  execFlag | fenceFlag,
                                  &fence);

    if(cmdBuffer->iSubmissionType & SUBMISSION_TYPE_MULTI_PIPE_MASTER)
    {
        // Publish the master's out fence for the slave submissions that follow.
        osContext->submit_fence = fence;
    }

    if(cmdBuffer->iSubmissionType & SUBMISSION_TYPE_MULTI_PIPE_FLAGS_LAST_PIPE)
    {
        // Only close a descriptor that was actually handed out.
        if (-1 != fence)
        {
            close(fence);
        }
    }

    return ret;
}

int32_t GpuContextSpecific::ParallelSubmitCommands(
    std::map<uint32_t, PMOS_COMMAND_BUFFER> secondaryCmdBufs,
    PMOS_CONTEXT osContext,
    uint32_t execFlag,
    int32_t dr4)
{
    int32_t      ret        = 0;
    int          fence      = -1;
    unsigned int fenceFlag  = 0;
    auto         it         = m_secondaryCmdBufs.begin();
    MOS_LINUX_BO *cmdBos[MAX_PARALLEN_CMD_BO_NUM];
    int          numBos     = 0; // exclude FE bo

    MOS_LINUX_CONTEXT *queue = m_i915Context[0];
    bool isVeboxSubmission   = false;

    if (execFlag == MOS_GPU_NODE_VIDEO || execFlag == MOS_GPU_NODE_VIDEO2)
    {
        execFlag = I915_EXEC_DEFAULT;
    }
    if (execFlag == MOS_GPU_NODE_VE)
    {
        execFlag = I915_EXEC_DEFAULT;
        isVeboxSubmission = true;
    }

    while(it != m_secondaryCmdBufs.end())
    {
        if(it->second->iSubmissionType & SUBMISSION_TYPE_MULTI_PIPE_ALONE)
        {
            fenceFlag = I915_EXEC_FENCE_OUT;
            queue = m_i915Context[0];

            ret = mos_gem_bo_context_exec2(it->second->OsResource.bo,
                                  it->second->OsResource.bo->size,
                                  queue,
                                  nullptr,
                                  0,
                                  dr4,
                                  execFlag | fenceFlag,
                                  &fence);

            osContext->submit_fence = fence;
        }

        if((it->second->iSubmissionType & SUBMISSION_TYPE_MULTI_PIPE_MASTER)
            || (it->second->iSubmissionType & SUBMISSION_TYPE_MULTI_PIPE_SLAVE))
        {
            cmdBos[numBos++] = it->second->OsResource.bo;

            if(it->second->iSubmissionType & SUBMISSION_TYPE_MULTI_PIPE_FLAGS_LAST_PIPE)
            {
                queue = m_i915Context[numBos - 1];
                MOS_OS_CHK_NULL_RETURN(queue);
                if(-1 != fence)
                {
                    fenceFlag = I915_EXEC_FENCE_IN;
                }

                ret = mos_gem_bo_context_exec3(cmdBos,
                                              numBos,
                                              queue,
                                              nullptr,
                                              0,
                                              dr4,
                                              execFlag | fenceFlag,
                                              &fence);

                for(int i = 0; i < numBos; i++)
                {
                    cmdBos[i] = nullptr;
                }
                numBos = 0;

                if(-1 != fence)
                {
                    close(fence);
                }
            }
        }

        it++;
    }

    return ret;
}

//!
//! \brief    Advance the GPU status tag to its next value
//! \details  The tag is reduced modulo UINT_MAX before 1 is added, and a
//!           resulting tag of 0 is replaced by 1.
//!
void GpuContextSpecific::IncrementGpuStatusTag()
{
    // (tag % UINT_MAX) is below UINT_MAX, so the addition stays within range.
    m_GPUStatusTag = 1 + (m_GPUStatusTag % UINT_MAX);

    // Zero is never kept as a tag value.
    if (0 == m_GPUStatusTag)
    {
        m_GPUStatusTag = 1;
    }
}

//!
//! \brief    Reset the per-submission bookkeeping of this GPU context
//! \details  Zeroes the allocation, patch location, attached resource and
//!           write mode lists (using the default list sizes), resets their
//!           counters, and drops the command buffer BO pointer once the
//!           command buffer has been flushed.
//!
void GpuContextSpecific::ResetGpuContextStatus()
{
    MOS_ZeroMemory(m_allocationList, sizeof(ALLOCATION_LIST) * ALLOCATIONLIST_SIZE);
    m_numAllocations = 0;
    MOS_ZeroMemory(m_patchLocationList, sizeof(PATCHLOCATIONLIST) * PATCHLOCATIONLIST_SIZE);
    m_currentNumPatchLocations = 0;

    MOS_ZeroMemory(m_attachedResources, sizeof(MOS_RESOURCE) * ALLOCATIONLIST_SIZE);
    m_resCount = 0;

    MOS_ZeroMemory(m_writeModeList, sizeof(bool) * ALLOCATIONLIST_SIZE);

    // Guard against a missing command buffer before touching its resource;
    // clearing an already-null bo pointer is harmless, so no extra test is needed.
    if (m_cmdBufFlushed && m_commandBuffer != nullptr)
    {
        m_commandBuffer->OsResource.bo = nullptr;
    }
}

//!
//! \brief    Allocate and lock the GPU status buffer of this context
//! \details  Allocates the MOS_RESOURCE wrapper kept in
//!           m_statusBufferMosResource, creates a linear buffer resource of
//!           sizeof(MOS_GPU_STATUS_DATA) bytes, locks it for writing and stores
//!           it in m_statusBufferResource. The resource is left locked on
//!           success.
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS on success, the allocation status when
//!           Allocate() fails, MOS_STATUS_UNKNOWN when the lock fails, or the
//!           status of a failing null check
//!
MOS_STATUS GpuContextSpecific::AllocateGPUStatusBuf()
{
    MOS_OS_FUNCTION_ENTER;

    m_statusBufferMosResource = (MOS_RESOURCE_HANDLE)MOS_AllocAndZeroMemory(sizeof(MOS_RESOURCE));
    MOS_OS_CHK_NULL_RETURN(m_statusBufferMosResource);

    GraphicsResource::CreateParams params;
    params.m_tileType  = MOS_TILE_LINEAR;
    params.m_type      = MOS_GFXRES_BUFFER;
    params.m_format    = Format_Buffer;
    params.m_width     = sizeof(MOS_GPU_STATUS_DATA);
    params.m_height    = 1;
    params.m_depth     = 1;
    params.m_arraySize = 1;
    params.m_name      = "GPU Status Buffer";

    GraphicsResource *graphicsResource = GraphicsResource::CreateGraphicResource(GraphicsResource::osSpecificResource);
    MOS_OS_CHK_NULL_RETURN(graphicsResource);

    // Release the resource object when the allocation fails; returning straight
    // away would leak it because nothing else references it yet.
    MOS_STATUS allocStatus = graphicsResource->Allocate(m_osContext, params);
    if (allocStatus != MOS_STATUS_SUCCESS)
    {
        MOS_OS_ASSERTMESSAGE("Unable to allocate gpu status buffer.");
        MOS_Delete(graphicsResource);
        return allocStatus;
    }

    GraphicsResource::LockParams lockParams;
    lockParams.m_writeRequest = true;
    auto gpuStatusData       = (MOS_GPU_STATUS_DATA *)graphicsResource->Lock(m_osContext, lockParams);
    if (gpuStatusData == nullptr)
    {
        MOS_OS_ASSERTMESSAGE("Unable to lock gpu eStatus buffer for read.");
        graphicsResource->Free(m_osContext);
        MOS_Delete(graphicsResource);
        return MOS_STATUS_UNKNOWN;
    }

    m_statusBufferResource = graphicsResource;
    return MOS_STATUS_SUCCESS;
}

#if (_DEBUG || _RELEASE_INTERNAL)
//!
//! \brief    Restrict an engine map to the engine instances selected by the user
//! \details  Extracts the per-node selection bits from userEngineInstance and
//!           limits them to the low *engineNum bits. When at least one bit is
//!           set, the selected entries are packed to the front of engineMap,
//!           every other entry is zeroed and *engineNum is updated to the
//!           number of selected entries.
//! \param    [in,out] engineMap
//!           Engine class/instance array with *engineNum entries
//! \param    [in,out] engineNum
//!           Number of entries in engineMap; updated when a selection applies
//! \param    [in] userEngineInstance
//!           User supplied selection bit field
//! \param    [in] gpuNode
//!           GPU node choosing which bit range of userEngineInstance is used
//! \return   bool
//!           true when at least one engine instance bit is selected
//!
bool GpuContextSpecific::SelectEngineInstanceByUser(struct i915_engine_class_instance *engineMap,
        uint32_t *engineNum, uint32_t userEngineInstance, MOS_GPU_NODE gpuNode)
{
    uint32_t selectionBits = 0x0;
    uint32_t nodeShift     = 0;
    bool     nodeSupported = true;

    // Pick the bit range of the user value that belongs to this GPU node.
    if (gpuNode == MOS_GPU_NODE_COMPUTE)
    {
        nodeShift = ENGINE_INSTANCE_SELECT_COMPUTE_INSTANCE_SHIFT;
    }
    else if (gpuNode == MOS_GPU_NODE_VE)
    {
        nodeShift = ENGINE_INSTANCE_SELECT_VEBOX_INSTANCE_SHIFT;
    }
    else if (gpuNode == MOS_GPU_NODE_VIDEO || gpuNode == MOS_GPU_NODE_VIDEO2)
    {
        nodeShift = ENGINE_INSTANCE_SELECT_VDBOX_INSTANCE_SHIFT;
    }
    else
    {
        nodeSupported = false;
        MOS_OS_NORMALMESSAGE("Invalid gpu node in use.");
    }

    if (nodeSupported)
    {
        // Keep only as many selection bits as there are engines in the map.
        selectionBits = (userEngineInstance >> nodeShift)
            & (ENGINE_INSTANCE_SELECT_ENABLE_MASK >> (MAX_ENGINE_INSTANCE_NUM - *engineNum));
    }

    const bool anySelected = (selectionBits != 0);
    if (anySelected)
    {
        int packedCount = 0;
        for (int bit = 0; bit < *engineNum; bit++)
        {
            const bool chosen = (((selectionBits >> bit) & 0x1) != 0);
            if (!chosen)
            {
                // Unselected engines are wiped from the map.
                engineMap[bit].engine_class    = 0;
                engineMap[bit].engine_instance = 0;
                continue;
            }

            if (bit > packedCount)
            {
                // Move the selected engine down into the first free slot.
                engineMap[packedCount].engine_class    = engineMap[bit].engine_class;
                engineMap[packedCount].engine_instance = engineMap[bit].engine_instance;
                engineMap[bit].engine_class            = 0;
                engineMap[bit].engine_instance         = 0;
                packedCount++;
            }
            else if (bit == packedCount)
            {
                // Already in place.
                packedCount++;
            }
        }
        *engineNum = packedCount;
    }

    return anySelected;
}
#endif
